From 0fe505ef10353bc37383358cf4a067580b7224d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ce=20Ge=20=28=E6=88=88=E7=AD=96=29?= <gece@foxmail.com>
Date: Thu, 5 Dec 2024 20:44:00 +0800
Subject: [PATCH] add python_lambda_mapper (#492)

* init python_lambda_mapper

* set default arg

* fix init

* support batched & add docs

* fix docs

* Quick fix for some minor problems (#503)

* * remove str conversion for fps para of add_stream func
+ add requires from librosa to avoid lazy_loader failure during multiprocessing

* * remove str conversion for fps para of add_stream func
+ add requires from librosa to avoid lazy_loader failure during multiprocessing

* * install cmake before

* * install cmake before

* * install cmake before

* * update unit test tags

* * update unit test tags

* * update unit test tags

* * update unit test tags

* * try to remove samplerate dep

* * skip audio duration and audio nmf snr filters

* * skip video_tagging_from_frames_filter

* * skip video_tagging_from_audios_filter

* * skip video_motion_score_raft_filter

* fix batch bug (#504)

* fix batch bug

* fix filter batch

* not rank for filter

* limit pyav version

---------

Co-authored-by: Yilun Huang <lielin.hyl@alibaba-inc.com>
Co-authored-by: BeachWang <1400012807@pku.edu.cn>
---
 configs/config_all.yaml                       |  3 +
 data_juicer/ops/mapper/__init__.py            | 27 +++----
 .../ops/mapper/python_lambda_mapper.py        | 74 +++++++++++++++++++
 docs/Operators.md                             |  3 +-
 docs/Operators_ZH.md                          |  3 +-
 tests/ops/mapper/test_python_lambda_mapper.py | 68 +++++++++++++++++
 6 files changed, 163 insertions(+), 15 deletions(-)
 create mode 100644 data_juicer/ops/mapper/python_lambda_mapper.py
 create mode 100644 tests/ops/mapper/test_python_lambda_mapper.py

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index ea10be519..f9a0b943d 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -257,6 +257,9 @@ process:
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
+  - python_lambda_mapper:                                   # execute a Python lambda function on data samples.
+      lambda_str: ''                                          # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
+      batched: False                                          # A boolean indicating whether to process input data in batches.
   - remove_bibliography_mapper:                             # remove bibliography from Latex text.
   - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.
       doc_type: tex                                           # comment type you want to remove. Only support 'tex' for now.
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index db4f54e10..863167a6a 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -30,6 +30,7 @@
 from .optimize_response_mapper import OptimizeResponseMapper
 from .pair_preference_mapper import PairPreferenceMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
+from .python_lambda_mapper import PythonLambdaMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
 from .remove_header_mapper import RemoveHeaderMapper
@@ -75,17 +76,17 @@
     'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
     'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
     'PairPreferenceMapper', 'PunctuationNormalizationMapper',
-    'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper',
-    'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper',
-    'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper',
-    'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper',
-    'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper',
-    'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper',
-    'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper',
-    'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper',
-    'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper',
-    'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
-    'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
-    'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
-    'WhitespaceNormalizationMapper'
+    'PythonLambdaMapper', 'RemoveBibliographyMapper', 'RemoveCommentsMapper',
+    'RemoveHeaderMapper', 'RemoveLongWordsMapper',
+    'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
+    'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
+    'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
+    'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper',
+    'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
+    'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
+    'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
+    'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
+    'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
+    'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
+    'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
 ]
diff --git a/data_juicer/ops/mapper/python_lambda_mapper.py b/data_juicer/ops/mapper/python_lambda_mapper.py
new file mode 100644
index 000000000..e90c77f48
--- /dev/null
+++ b/data_juicer/ops/mapper/python_lambda_mapper.py
@@ -0,0 +1,74 @@
+import ast
+
+from ..base_op import OPERATORS, Mapper
+
+OP_NAME = 'python_lambda_mapper'
+
+
+@OPERATORS.register_module(OP_NAME)
+class PythonLambdaMapper(Mapper):
+    """Mapper for executing Python lambda function on data samples."""
+
+    def __init__(self, lambda_str: str = '', batched: bool = False, **kwargs):
+        """
+        Initialization method.
+
+        :param lambda_str: A string representation of the lambda function to be
+            executed on data samples. If empty, the identity function is used.
+        :param batched: A boolean indicating whether to process input data in
+            batches.
+        :param kwargs: Additional keyword arguments passed to the parent class.
+        """
+        self._batched_op = bool(batched)
+        super().__init__(**kwargs)
+
+        # Parse and validate the lambda function
+        if not lambda_str:
+            self.lambda_func = lambda sample: sample
+        else:
+            self.lambda_func = self._create_lambda(lambda_str)
+
+    def _create_lambda(self, lambda_str: str):
+        # Parse input string into an AST and check for a valid lambda function
+        try:
+            node = ast.parse(lambda_str, mode='eval')
+
+            # Check if the body of the expression is a lambda
+            if not isinstance(node.body, ast.Lambda):
+                raise ValueError(
+                    'Input string must be a valid lambda function.')
+
+            # Check that the lambda has exactly one argument
+            if len(node.body.args.args) != 1:
+                raise ValueError(
+                    'Lambda function must have exactly one argument.')
+
+            # Compile the AST to code
+            compiled_code = compile(node, '<string>', 'eval')
+            # Evaluate the compiled code with built-ins enabled (NOT sandboxed)
+            func = eval(compiled_code, {'__builtins__': __builtins__})
+            return func
+        except Exception as e:
+            raise ValueError(f'Invalid lambda function: {e}')
+
+    def process_single(self, sample):
+        # Process the input through the lambda function and return the result
+        result = self.lambda_func(sample)
+
+        # Check if the result is a valid dict
+        if not isinstance(result, dict):
+            raise ValueError(f'Lambda function must return a dictionary, '
+                             f'got {type(result).__name__} instead.')
+
+        return result
+
+    def process_batched(self, samples):
+        # Process the input through the lambda function and return the result
+        result = self.lambda_func(samples)
+
+        # Check if the result is a valid dict
+        if not isinstance(result, dict):
+            raise ValueError(f'Lambda function must return a dictionary, '
+                             f'got {type(result).__name__} instead.')
+
+        return result
diff --git a/docs/Operators.md b/docs/Operators.md
index f24523dc5..218f883be 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   9    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   59   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   60   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   44   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   8    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   4    | Selects top samples based on ranking            |
@@ -88,6 +88,7 @@ All the specific operators are listed below, each featured with several capabili
 | optimize_response_mapper                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)   | Optimize the response in question-answering samples.                                                                                                                           | [code](../data_juicer/ops/mapper/optimize_response_mapper.py)                      | [tests](../tests/ops/mapper/test_optimize_response_mapper.py)                      |
 | pair_preference_mapper                         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Construct paired preference samples.                                                                                                                                           | [code](../data_juicer/ops/mapper/pair_preference_mapper.py)                        | [tests](../tests/ops/mapper/test_pair_preference_mapper.py)                        |
 | punctuation_normalization_mapper               | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Normalizes various Unicode punctuations to their ASCII equivalents                                                                                                             | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py)              | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py)              |
+| python_lambda_mapper                           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Executes a Python lambda function on data samples                                                                                                                              | [code](../data_juicer/ops/mapper/python_lambda_mapper.py)                          | [tests](../tests/ops/mapper/test_python_lambda_mapper.py)                          |
 | remove_bibliography_mapper                     | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the bibliography of TeX documents                                                                                                                                      | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py)                    | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py)                    |
 | remove_comments_mapper                         | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the comments of TeX documents                                                                                                                                          | [code](../data_juicer/ops/mapper/remove_comments_mapper.py)                        | [tests](../tests/ops/mapper/test_remove_comments_mapper.py)                        |
 | remove_header_mapper                           | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names                                                                                   | [code](../data_juicer/ops/mapper/remove_header_mapper.py)                          | [tests](../tests/ops/mapper/test_remove_header_mapper.py)                          |
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index c771a30e9..6a37a0ac9 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  9 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 59 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 60 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 44 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  8 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  4 | 基于排序选取高质量样本   |
@@ -87,6 +87,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | optimize_response_mapper                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)   | 指令优化，优化 response                                                                                              | [code](../data_juicer/ops/mapper/optimize_response_mapper.py)                      | [tests](../tests/ops/mapper/test_optimize_response_mapper.py)                      |
 | pair_preference_mapper                         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 构造配对的偏好样本                                                                                                   | [code](../data_juicer/ops/mapper/pair_preference_mapper.py)                        | [tests](../tests/ops/mapper/test_pair_preference_mapper.py)                        |
 | punctuation_normalization_mapper               | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 将各种 Unicode 标点符号标准化为其 ASCII 等效项                                                                       | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py)              | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py)              |
+| python_lambda_mapper                           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 执行 Python lambda 函数处理样本                                                                                  | [code](../data_juicer/ops/mapper/python_lambda_mapper.py)                          | [tests](../tests/ops/mapper/test_python_lambda_mapper.py)                          |
 | remove_bibliography_mapper                     | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档的参考文献                                                                                              | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py)                    | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py)                    |
 | remove_comments_mapper                         | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档中的注释                                                                                                | [code](../data_juicer/ops/mapper/remove_comments_mapper.py)                        | [tests](../tests/ops/mapper/test_remove_comments_mapper.py)                        |
 | remove_header_mapper                           | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档头，例如标题、章节数字/名称等                                                                           | [code](../data_juicer/ops/mapper/remove_header_mapper.py)                          | [tests](../tests/ops/mapper/test_remove_header_mapper.py)                          |
diff --git a/tests/ops/mapper/test_python_lambda_mapper.py b/tests/ops/mapper/test_python_lambda_mapper.py
new file mode 100644
index 000000000..97fac4794
--- /dev/null
+++ b/tests/ops/mapper/test_python_lambda_mapper.py
@@ -0,0 +1,68 @@
+import unittest
+
+from data_juicer.ops.mapper.python_lambda_mapper import PythonLambdaMapper
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class PythonLambdaMapperMapper(DataJuicerTestCaseBase):
+
+    def test_lambda_function_batched(self):
+        mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + [6]}", batched=True)  # Append '6' to value
+        result = mapper.process_batched({'value': [5]})
+        self.assertEqual(result, {'value': [5, 6]})
+
+    def test_lambda_modifies_values(self):
+        mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + 1}")  # '+1' to 'value'
+        result = mapper.process_single({'value': 5})
+        self.assertEqual(result, {'value': 6})
+
+    def test_lambda_combines_values(self):
+        mapper = PythonLambdaMapper("lambda d: {'combined': d['a'] + d['b']}")
+        result = mapper.process_single({'a': 3, 'b': 7})
+        self.assertEqual(result, {'combined': 10})
+
+    def test_lambda_swaps_values(self):
+        mapper = PythonLambdaMapper("lambda d: {'a': d['b'], 'b': d['a']}")
+        result = mapper.process_single({'a': 1, 'b': 2})
+        self.assertEqual(result, {'a': 2, 'b': 1})
+
+    def test_lambda_result_is_not_dict(self):
+        mapper = PythonLambdaMapper("lambda d: d['value'] + 1")  # This returns an int
+        with self.assertRaises(ValueError) as cm:
+            mapper.process_single({'value': 10})
+        self.assertIn("Lambda function must return a dictionary, got int instead.", str(cm.exception))
+
+    def test_invalid_syntax(self):
+        with self.assertRaises(ValueError) as cm:
+            PythonLambdaMapper("invalid lambda")  # Invalid syntax
+        self.assertIn("Invalid lambda function", str(cm.exception))
+
+    def test_invalid_expression(self):
+        with self.assertRaises(ValueError) as cm:
+            PythonLambdaMapper("3 + 5")  # Not a lambda
+        self.assertIn("Input string must be a valid lambda function.", str(cm.exception))
+
+    def test_lambda_with_multiple_arguments(self):
+        with self.assertRaises(ValueError) as cm:
+            PythonLambdaMapper("lambda x, y: {'sum': x + y}")  # Creating a lambda accepts two arguments
+        self.assertIn("Lambda function must have exactly one argument.", str(cm.exception))
+
+    def test_lambda_returning_unexpected_structure(self):
+        mapper = PythonLambdaMapper("lambda d: ({'value': d['value']}, {'extra': d['extra']})")  # Invalid return type; too many dictionaries
+        with self.assertRaises(ValueError) as cm:
+            mapper.process_single({'value': 5, 'extra': 10})
+        self.assertIn("Lambda function must return a dictionary, got tuple instead.", str(cm.exception))
+
+    def test_lambda_modifies_in_place_and_returns(self):
+        mapper = PythonLambdaMapper("lambda d: d.update({'new_key': 'added_value'}) or d")  # Returns the modified dictionary
+        sample_dict = {'value': 3}
+        result = mapper.process_single(sample_dict)
+        self.assertEqual(result, {'value': 3, 'new_key': 'added_value'})  # Ensure the update worked
+
+    def test_lambda_function_with_no_operation(self):
+        mapper = PythonLambdaMapper("lambda d: d")  # Simply returns the input dictionary
+        sample_dict = {'key': 'value'}
+        result = mapper.process_single(sample_dict)
+        self.assertEqual(result, {'key': 'value'})  # Unchanged
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file