From 0fe505ef10353bc37383358cf4a067580b7224d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ce=20Ge=20=28=E6=88=88=E7=AD=96=29?= Date: Thu, 5 Dec 2024 20:44:00 +0800 Subject: [PATCH] add python_lambda_mapper (#492) * init python_lambda_mapper * set default arg * fix init * support batched & add docs * fix docs * Quick fix for some minor problems (#503) * * remove str conversion for fps para of add_stream func + add requires from librosa to avoid lazy_loader failure during multiprocessing * * remove str conversion for fps para of add_stream func + add requires from librosa to avoid lazy_loader failure during multiprocessing * * install cmake before * * install cmake before * * install cmake before * * update unit test tags * * update unit test tags * * update unit test tags * * update unit test tags * * try to remove samplerate dep * * skip audio duration and audio nmf snr filters * * skip video_tagging_from_frames_filter * * skip video_tagging_from_audios_filter * * skip video_motion_score_raft_filter * fix batch bug (#504) * fix batch bug * fix filter batch * not rank for filter * limit pyav version --------- Co-authored-by: Yilun Huang Co-authored-by: BeachWang <1400012807@pku.edu.cn> --- configs/config_all.yaml | 3 + data_juicer/ops/mapper/__init__.py | 27 +++---- .../ops/mapper/python_lambda_mapper.py | 74 +++++++++++++++++++ docs/Operators.md | 3 +- docs/Operators_ZH.md | 3 +- tests/ops/mapper/test_python_lambda_mapper.py | 68 +++++++++++++++++ 6 files changed, 163 insertions(+), 15 deletions(-) create mode 100644 data_juicer/ops/mapper/python_lambda_mapper.py create mode 100644 tests/ops/mapper/test_python_lambda_mapper.py diff --git a/configs/config_all.yaml b/configs/config_all.yaml index ea10be519..f9a0b943d 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -257,6 +257,9 @@ process: model_params: {} # Parameters for initializing the API model. sampling_params: {} # Extra parameters passed to the API call. - punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations. + - python_lambda_mapper: # executing Python lambda function on data samples. + lambda_str: '' # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used. + batched: False # A boolean indicating whether to process input data in batches. - remove_bibliography_mapper: # remove bibliography from Latex text. - remove_comments_mapper: # remove comments from Latex text, code, etc. doc_type: tex # comment type you want to remove. Only support 'tex' for now. diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index db4f54e10..863167a6a 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -30,6 +30,7 @@ from .optimize_response_mapper import OptimizeResponseMapper from .pair_preference_mapper import PairPreferenceMapper from .punctuation_normalization_mapper import PunctuationNormalizationMapper +from .python_lambda_mapper import PythonLambdaMapper from .remove_bibliography_mapper import RemoveBibliographyMapper from .remove_comments_mapper import RemoveCommentsMapper from .remove_header_mapper import RemoveHeaderMapper @@ -75,17 +76,17 @@ 'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper', 'PairPreferenceMapper', 'PunctuationNormalizationMapper', - 'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper', - 'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper', - 'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper', - 'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper', - 'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper', - 'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper', - 'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper', - 'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper', - 'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper', - 'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper', - 'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper', - 'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper', - 'WhitespaceNormalizationMapper' + 'PythonLambdaMapper', 'RemoveBibliographyMapper', 'RemoveCommentsMapper', + 'RemoveHeaderMapper', 'RemoveLongWordsMapper', + 'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper', + 'RemoveSpecificCharsMapper', 'RemoveTableTextMapper', + 'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper', + 'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper', + 'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper', + 'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper', + 'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper', + 'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper', + 'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper', + 'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper', + 'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper' ] diff --git a/data_juicer/ops/mapper/python_lambda_mapper.py b/data_juicer/ops/mapper/python_lambda_mapper.py new file mode 100644 index 000000000..e90c77f48 --- /dev/null +++ b/data_juicer/ops/mapper/python_lambda_mapper.py @@ -0,0 +1,74 @@ +import ast + +from ..base_op import OPERATORS, Mapper + +OP_NAME = 'python_lambda_mapper' + + +@OPERATORS.register_module(OP_NAME) +class PythonLambdaMapper(Mapper): + """Mapper for executing Python lambda function on data samples.""" + + def __init__(self, lambda_str: str = '', batched: bool = False, **kwargs): + """ + Initialization method. + + :param lambda_str: A string representation of the lambda function to be + executed on data samples. If empty, the identity function is used. + :param batched: A boolean indicating whether to process input data in + batches. + :param kwargs: Additional keyword arguments passed to the parent class. + """ + self._batched_op = bool(batched) + super().__init__(**kwargs) + + # Parse and validate the lambda function + if not lambda_str: + self.lambda_func = lambda sample: sample + else: + self.lambda_func = self._create_lambda(lambda_str) + + def _create_lambda(self, lambda_str: str): + # Parse input string into an AST and check for a valid lambda function + try: + node = ast.parse(lambda_str, mode='eval') + + # Check if the body of the expression is a lambda + if not isinstance(node.body, ast.Lambda): + raise ValueError( + 'Input string must be a valid lambda function.') + + # Check that the lambda has exactly one argument + if len(node.body.args.args) != 1: + raise ValueError( + 'Lambda function must have exactly one argument.') + + # Compile the AST to code + compiled_code = compile(node, '', 'eval') + # Safely evaluate the compiled code allowing built-in functions + func = eval(compiled_code, {'__builtins__': __builtins__}) + return func + except Exception as e: + raise ValueError(f'Invalid lambda function: {e}') + + def process_single(self, sample): + # Process the input through the lambda function and return the result + result = self.lambda_func(sample) + + # Check if the result is a valid + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result + + def process_batched(self, samples): + # Process the input through the lambda function and return the result + result = self.lambda_func(samples) + + # Check if the result is a valid + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result diff --git a/docs/Operators.md b/docs/Operators.md index f24523dc5..218f883be 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types. | Type | Number | Description | |-----------------------------------|:------:|-------------------------------------------------| | [ Formatter ]( #formatter ) | 9 | Discovers, loads, and canonicalizes source data | -| [ Mapper ]( #mapper ) | 59 | Edits and transforms samples | +| [ Mapper ]( #mapper ) | 60 | Edits and transforms samples | | [ Filter ]( #filter ) | 44 | Filters out low-quality samples | | [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | @@ -88,6 +88,7 @@ All the specific operators are listed below, each featured with several capabili | optimize_response_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | Optimize the response in question-answering samples. | [code](../data_juicer/ops/mapper/optimize_response_mapper.py) | [tests](../tests/ops/mapper/test_optimize_response_mapper.py) | | pair_preference_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Construct paired preference samples. | [code](../data_juicer/ops/mapper/pair_preference_mapper.py) | [tests](../tests/ops/mapper/test_pair_preference_mapper.py) | | punctuation_normalization_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Normalizes various Unicode punctuations to their ASCII equivalents | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py) | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py) | +| python_lambda_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Executing Python lambda function on data samples | [code](../data_juicer/ops/mapper/python_lambda_mapper.py) | [tests](../tests/ops/mapper/test_python_lambda_mapper.py) | | remove_bibliography_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the bibliography of TeX documents | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py) | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py) | | remove_comments_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the comments of TeX documents | [code](../data_juicer/ops/mapper/remove_comments_mapper.py) | [tests](../tests/ops/mapper/test_remove_comments_mapper.py) | | remove_header_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names | [code](../data_juicer/ops/mapper/remove_header_mapper.py) | [tests](../tests/ops/mapper/test_remove_header_mapper.py) | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index c771a30e9..6a37a0ac9 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | 类型 | 数量 | 描述 | |------------------------------------|:--:|---------------| | [ Formatter ]( #formatter ) | 9 | 发现、加载、规范化原始数据 | -| [ Mapper ]( #mapper ) | 59 | 对数据样本进行编辑和转换 | +| [ Mapper ]( #mapper ) | 60 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 44 | 过滤低质量样本 | | [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | @@ -87,6 +87,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | optimize_response_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 指令优化,优化 response | [code](../data_juicer/ops/mapper/optimize_response_mapper.py) | [tests](../tests/ops/mapper/test_optimize_response_mapper.py) | | pair_preference_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 构造配对的偏好样本 | [code](../data_juicer/ops/mapper/pair_preference_mapper.py) | [tests](../tests/ops/mapper/test_pair_preference_mapper.py) | | punctuation_normalization_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py) | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py) | +| python_lambda_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 执行 Python lambda 函数处理样本 | [code](../data_juicer/ops/mapper/python_lambda_mapper.py) | [tests](../tests/ops/mapper/test_python_lambda_mapper.py) | | remove_bibliography_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档的参考文献 | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py) | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py) | | remove_comments_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档中的注释 | [code](../data_juicer/ops/mapper/remove_comments_mapper.py) | [tests](../tests/ops/mapper/test_remove_comments_mapper.py) | | remove_header_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档头,例如标题、章节数字/名称等 | [code](../data_juicer/ops/mapper/remove_header_mapper.py) | [tests](../tests/ops/mapper/test_remove_header_mapper.py) | diff --git a/tests/ops/mapper/test_python_lambda_mapper.py b/tests/ops/mapper/test_python_lambda_mapper.py new file mode 100644 index 000000000..97fac4794 --- /dev/null +++ b/tests/ops/mapper/test_python_lambda_mapper.py @@ -0,0 +1,68 @@ +import unittest + +from data_juicer.ops.mapper.python_lambda_mapper import PythonLambdaMapper +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + +class PythonLambdaMapperMapper(DataJuicerTestCaseBase): + + def test_lambda_function_batched(self): + mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + [6]}", batched=True) # Append '6' to value + result = mapper.process_batched({'value': [5]}) + self.assertEqual(result, {'value': [5, 6]}) + + def test_lambda_modifies_values(self): + mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + 1}") # '+1' to 'value' + result = mapper.process_single({'value': 5}) + self.assertEqual(result, {'value': 6}) + + def test_lambda_combines_values(self): + mapper = PythonLambdaMapper("lambda d: {'combined': d['a'] + d['b']}") + result = mapper.process_single({'a': 3, 'b': 7}) + self.assertEqual(result, {'combined': 10}) + + def test_lambda_swaps_values(self): + mapper = PythonLambdaMapper("lambda d: {'a': d['b'], 'b': d['a']}") + result = mapper.process_single({'a': 1, 'b': 2}) + self.assertEqual(result, {'a': 2, 'b': 1}) + + def test_lambda_result_is_not_dict(self): + mapper = PythonLambdaMapper("lambda d: d['value'] + 1") # This returns an int + with self.assertRaises(ValueError) as cm: + mapper.process_single({'value': 10}) + self.assertIn("Lambda function must return a dictionary, got int instead.", str(cm.exception)) + + def test_invalid_syntax(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("invalid lambda") # Invalid syntax + self.assertIn("Invalid lambda function", str(cm.exception)) + + def test_invalid_expression(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("3 + 5") # Not a lambda + self.assertIn("Input string must be a valid lambda function.", str(cm.exception)) + + def test_lambda_with_multiple_arguments(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("lambda x, y: {'sum': x + y}") # Creating a lambda accepts two arguments + self.assertIn("Lambda function must have exactly one argument.", str(cm.exception)) + + def test_lambda_returning_unexpected_structure(self): + mapper = PythonLambdaMapper("lambda d: ({'value': d['value']}, {'extra': d['extra']})") # Invalid return type; too many dictionaries + with self.assertRaises(ValueError) as cm: + mapper.process_single({'value': 5, 'extra': 10}) + self.assertIn("Lambda function must return a dictionary, got tuple instead.", str(cm.exception)) + + def test_lambda_modifies_in_place_and_returns(self): + mapper = PythonLambdaMapper("lambda d: d.update({'new_key': 'added_value'}) or d") # Returns the modified dictionary + sample_dict = {'value': 3} + result = mapper.process_single(sample_dict) + self.assertEqual(result, {'value': 3, 'new_key': 'added_value'}) # Ensure the update worked + + def test_lambda_function_with_no_operation(self): + mapper = PythonLambdaMapper("lambda d: d") # Simply returns the input dictionary + sample_dict = {'key': 'value'} + result = mapper.process_single(sample_dict) + self.assertEqual(result, {'key': 'value'}) # Unchanged + +if __name__ == '__main__': + unittest.main() \ No newline at end of file