diff --git a/.doctrees/data_juicer.analysis.doctree b/.doctrees/data_juicer.analysis.doctree
index d0e773001..177be1c3d 100644
Binary files a/.doctrees/data_juicer.analysis.doctree and b/.doctrees/data_juicer.analysis.doctree differ
diff --git a/.doctrees/data_juicer.core.doctree b/.doctrees/data_juicer.core.doctree
index 3dd2879d4..30eb6af9a 100644
Binary files a/.doctrees/data_juicer.core.doctree and b/.doctrees/data_juicer.core.doctree differ
diff --git a/.doctrees/data_juicer.format.doctree b/.doctrees/data_juicer.format.doctree
index 0d369e7af..a0e703738 100644
Binary files a/.doctrees/data_juicer.format.doctree and b/.doctrees/data_juicer.format.doctree differ
diff --git a/.doctrees/data_juicer.ops.common.doctree b/.doctrees/data_juicer.ops.common.doctree
index 3b3c006f6..bc2074b92 100644
Binary files a/.doctrees/data_juicer.ops.common.doctree and b/.doctrees/data_juicer.ops.common.doctree differ
diff --git a/.doctrees/data_juicer.ops.deduplicator.doctree b/.doctrees/data_juicer.ops.deduplicator.doctree
index e189764b6..304898271 100644
Binary files a/.doctrees/data_juicer.ops.deduplicator.doctree and b/.doctrees/data_juicer.ops.deduplicator.doctree differ
diff --git a/.doctrees/data_juicer.ops.doctree b/.doctrees/data_juicer.ops.doctree
index d0f37495a..3ace95e38 100644
Binary files a/.doctrees/data_juicer.ops.doctree and b/.doctrees/data_juicer.ops.doctree differ
diff --git a/.doctrees/data_juicer.ops.filter.doctree b/.doctrees/data_juicer.ops.filter.doctree
index bb67dc4fa..193ef3ce9 100644
Binary files a/.doctrees/data_juicer.ops.filter.doctree and b/.doctrees/data_juicer.ops.filter.doctree differ
diff --git a/.doctrees/data_juicer.ops.mapper.doctree b/.doctrees/data_juicer.ops.mapper.doctree
index 7f796dd8b..35a03ff5b 100644
Binary files a/.doctrees/data_juicer.ops.mapper.doctree and b/.doctrees/data_juicer.ops.mapper.doctree differ
diff --git a/.doctrees/data_juicer.ops.selector.doctree b/.doctrees/data_juicer.ops.selector.doctree
index cce2ba322..a6c36d698 100644
Binary files a/.doctrees/data_juicer.ops.selector.doctree and b/.doctrees/data_juicer.ops.selector.doctree differ
diff --git a/.doctrees/data_juicer.utils.doctree b/.doctrees/data_juicer.utils.doctree
index fb149ce2e..8b363f6fb 100644
Binary files a/.doctrees/data_juicer.utils.doctree and b/.doctrees/data_juicer.utils.doctree differ
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
index 77795d23e..5288b3f45 100644
Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ
diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html
index 9f228513d..f06990850 100644
--- a/_modules/data_juicer/analysis/column_wise_analysis.html
+++ b/_modules/data_juicer/analysis/column_wise_analysis.html
@@ -124,7 +124,7 @@

Source code for data_juicer.analysis.column_wise_analysis

[docs]class ColumnWiseAnalysis: """Apply analysis on each column of stats respectively.""" - def __init__(self, +
[docs] def __init__(self, dataset, output_path, overall_result=None, @@ -148,7 +148,7 @@

Source code for data_juicer.analysis.column_wise_analysis

overall_result = oa.analyse() self.overall_result = overall_result - self.save_stats_in_one_file = save_stats_in_one_file + self.save_stats_in_one_file = save_stats_in_one_file
[docs] def analyse(self, show_percentiles=False, show=False): """
diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html
index d6fcd3f5a..c09eae1a7 100644
--- a/_modules/data_juicer/analysis/diversity_analysis.html
+++ b/_modules/data_juicer/analysis/diversity_analysis.html
@@ -157,7 +157,7 @@

Source code for data_juicer.analysis.diversity_analysis

"""Apply diversity analysis for each sample and get an overall analysis result.""" - def __init__(self, dataset, output_path, lang_or_model='en'): +
[docs] def __init__(self, dataset, output_path, lang_or_model='en'): """Initialization method :param dataset: the dataset to be analysed :param output_path: path to store the analysis results :param lang_or_model: the diversity model or a specific language used to load @@ -167,7 +167,7 @@

Source code for data_juicer.analysis.diversity_analysis

self.output_path = output_path if not os.path.exists(self.output_path): os.makedirs(self.output_path) - self.lang_or_model = lang_or_model + self.lang_or_model = lang_or_model
[docs] def compute(self, lang_or_model=None, column_name='text'): """
diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html
index 72b9739d1..a0c380a29 100644
--- a/_modules/data_juicer/analysis/overall_analysis.html
+++ b/_modules/data_juicer/analysis/overall_analysis.html
@@ -78,7 +78,7 @@

Source code for data_juicer.analysis.overall_analysis

"""Apply analysis on the overall stats, including mean, std, quantiles, etc.""" - def __init__(self, dataset, output_path): +
[docs] def __init__(self, dataset, output_path): """ Initialization method. @@ -91,7 +91,7 @@

Source code for data_juicer.analysis.overall_analysis

os.makedirs(self.output_path) # default percentiles to analyse - self.default_percentiles = [0.25, 0.5, 0.75] + self.default_percentiles = [0.25, 0.5, 0.75]
[docs] def analyse(self, percentiles=[]): """
diff --git a/_modules/data_juicer/core/analyser.html b/_modules/data_juicer/core/analyser.html
index 127207bd9..6c7bd3ecb 100644
--- a/_modules/data_juicer/core/analyser.html
+++ b/_modules/data_juicer/core/analyser.html
@@ -94,7 +94,7 @@

Source code for data_juicer.core.analyser

     dataset better.
     """
 
-    def __init__(self, cfg=None):
+
[docs] def __init__(self, cfg=None): """ Initialization method. @@ -131,7 +131,7 @@

Source code for data_juicer.core.analyser

         # parsed_res
         self.overall_result = None
         self.overall_single_plot_path = None
-        self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
+        self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
[docs] def run(self, load_data_np=None): """
diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index 045c9b4f6..5a9fa91e6 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -144,7 +144,7 @@

Source code for data_juicer.core.data

 
[docs]class NestedQueryDict(dict): """Enhanced dict for better usability.""" - def __init__(self, *args, **kargs): +
[docs] def __init__(self, *args, **kargs): if len(args) == 1 and isinstance(args[0], Dataset): # init from another DatasetDict instance self.__dict__ = copy.copy(args[0].__dict__) @@ -155,7 +155,7 @@

Source code for data_juicer.core.data

         # batched sample, (k & v) are organized by list manner
         for k, v in self.items():
             if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
-                self[k] = [NestedQueryDict(item) for item in v]
+                self[k] = [NestedQueryDict(item) for item in v]
def __getitem__(self, key): return nested_query(self, key)
@@ -164,13 +164,13 @@

Source code for data_juicer.core.data

 
[docs]class NestedDatasetDict(DatasetDict): """Enhanced HuggingFace-DatasetDict for better usability and efficiency.""" - def __init__(self, *args, **kargs): +
[docs] def __init__(self, *args, **kargs): if len(args) == 1 and isinstance(args[0], Dataset): # init from another DatasetDict instance self.__dict__ = copy.copy(args[0].__dict__) else: # init from scratch - super().__init__(*args, **kargs) + super().__init__(*args, **kargs)
def __getitem__(self, key): return nested_query(self, key) @@ -189,7 +189,7 @@

Source code for data_juicer.core.data

 
[docs]class NestedDataset(Dataset): """Enhanced HuggingFace-Dataset for better usability and efficiency.""" - def __init__(self, *args, **kargs): +
[docs] def __init__(self, *args, **kargs): if len(args) == 1 and isinstance(args[0], Dataset): # init from another Dataset instance self.__dict__ = copy.copy(args[0].__dict__) @@ -197,7 +197,7 @@

Source code for data_juicer.core.data

             # init from scratch
             super().__init__(*args, **kargs)
 
-        self.need_to_cleanup_caches = not is_caching_enabled()
+        self.need_to_cleanup_caches = not is_caching_enabled()
def __getitem__(self, key): if isinstance(key, str):
diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html
index 14d3ed34e..de867852d 100644
--- a/_modules/data_juicer/core/executor.html
+++ b/_modules/data_juicer/core/executor.html
@@ -94,7 +94,7 @@

Source code for data_juicer.core.executor

     ops in the config file in order and generate a processed dataset.
     """
 
-    def __init__(self, cfg=None):
+
[docs] def __init__(self, cfg=None): """ Initialization method. @@ -149,7 +149,7 @@

Source code for data_juicer.core.executor

             self.op_list_to_trace = self.cfg.op_list_to_trace
             if len(self.cfg.op_list_to_trace) == 0:
                 logger.info('Trace for all ops.')
-                self.op_list_to_trace = set(OPERATORS.modules.keys())
+                self.op_list_to_trace = set(OPERATORS.modules.keys())
[docs] def run(self, load_data_np=None): """
diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html
index db8947990..84cbf9dc8 100644
--- a/_modules/data_juicer/core/exporter.html
+++ b/_modules/data_juicer/core/exporter.html
@@ -86,7 +86,7 @@

Source code for data_juicer.core.exporter

     GiB = 2**30  # 1024*1024*1024
     TiB = 2**40  # 1024*1024*1024*1024
 
-    def __init__(self,
+
[docs] def __init__(self, export_path, export_shard_size=0, export_in_parallel=True, @@ -139,7 +139,7 @@

Source code for data_juicer.core.exporter

             logger.warning(f'The export_shard_size [{self.max_shard_size_str}]'
                            f' is larger than 1TiB. It might generate large '
                            f'single shard file and make loading and exporting '
-                           f'slower.')
+                           f'slower.')
def _get_suffix(self, export_path): """
diff --git a/_modules/data_juicer/core/ray_executor.html b/_modules/data_juicer/core/ray_executor.html
index 8abcea806..805dba302 100644
--- a/_modules/data_juicer/core/ray_executor.html
+++ b/_modules/data_juicer/core/ray_executor.html
@@ -90,7 +90,7 @@

Source code for data_juicer.core.ray_executor

2. Advanced functions such as checkpoint, tracer are not supported. """ - def __init__(self, cfg=None): +
[docs] def __init__(self, cfg=None): """ Initialization method. @@ -104,7 +104,7 @@

Source code for data_juicer.core.ray_executor

# init ray logger.info('Initing Ray ...') ray.init(self.cfg.ray_address) - self.process_list = self.cfg.process + self.process_list = self.cfg.process
[docs] def run(self, load_data_np=None):
diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html
index 555d13cfa..6c11165d0 100644
--- a/_modules/data_juicer/core/tracer.html
+++ b/_modules/data_juicer/core/tracer.html
@@ -84,7 +84,7 @@

Source code for data_juicer.core.tracer

     The comparison results will be stored in the work directory.
     """
 
-    def __init__(self, work_dir, show_num=10):
+
[docs] def __init__(self, work_dir, show_num=10): """ Initialization method. @@ -96,7 +96,7 @@

Source code for data_juicer.core.tracer

         self.work_dir = os.path.join(work_dir, 'trace')
         if not os.path.exists(self.work_dir):
             os.makedirs(self.work_dir)
-        self.show_num = show_num
+        self.show_num = show_num
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str):
diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html
index 338d1d688..c913fd93b 100644
--- a/_modules/data_juicer/format/csv_formatter.html
+++ b/_modules/data_juicer/format/csv_formatter.html
@@ -81,7 +81,7 @@

Source code for data_juicer.format.csv_formatter

""" SUFFIXES = ['.csv'] - def __init__(self, dataset_path, suffixes=None, **kwargs): +
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -94,7 +94,7 @@

Source code for data_juicer.format.csv_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='csv', **kwargs, - )
+ )
diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html
index 85379962d..d5f72b398 100644
--- a/_modules/data_juicer/format/formatter.html
+++ b/_modules/data_juicer/format/formatter.html
@@ -94,7 +94,7 @@

Source code for data_juicer.format.formatter

    """The class is used to load a dataset from local files or local
     directory."""
 
-    def __init__(
+
[docs] def __init__( self, dataset_path: str, type: str, @@ -120,7 +120,7 @@

Source code for data_juicer.format.formatter

self.kwargs = kwargs
         self.text_keys = text_keys
         self.data_files = find_files_with_suffix(dataset_path, suffixes)
-        self.add_suffix = add_suffix
+        self.add_suffix = add_suffix
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: """ @@ -155,7 +155,7 @@

Source code for data_juicer.format.formatter

    """The class is used to load a dataset from repository of huggingface
     hub."""
 
-    def __init__(self,
+
[docs] def __init__(self, dataset_path: str, text_keys: List[str] = None, **kwargs): @@ -169,7 +169,7 @@

Source code for data_juicer.format.formatter

        """
         self.path = dataset_path
         self.text_keys = text_keys
-        self.kwargs = kwargs
+        self.kwargs = kwargs
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: """
diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html
index adae4804b..5e2817e66 100644
--- a/_modules/data_juicer/format/json_formatter.html
+++ b/_modules/data_juicer/format/json_formatter.html
@@ -81,7 +81,7 @@

Source code for data_juicer.format.json_formatter

""" SUFFIXES = ['.json', '.jsonl', '.jsonl.zst'] - def __init__(self, dataset_path, suffixes=None, **kwargs): +
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -94,7 +94,7 @@

Source code for data_juicer.format.json_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='json', **kwargs, - )
+ )
diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html
index f38cbbd1a..003f8e53a 100644
--- a/_modules/data_juicer/format/mixture_formatter.html
+++ b/_modules/data_juicer/format/mixture_formatter.html
@@ -83,7 +83,7 @@

Source code for data_juicer.format.mixture_formatter

every dataset and merging them, and then exports the merged datasset as a new mixed dataset.""" - def __init__(self, +
[docs] def __init__(self, dataset_path: str, suffixes: Union[str, List[str], Tuple[str]] = None, text_keys=None, @@ -109,7 +109,7 @@

Source code for data_juicer.format.mixture_formatter

text_keys=text_keys, add_suffix=add_suffix, **kwargs) for data_prefix in data_prefixes - ] + ]
def _get_weight(self, data_prefix): """
diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html
index c7608e2da..8af8186f7 100644
--- a/_modules/data_juicer/format/parquet_formatter.html
+++ b/_modules/data_juicer/format/parquet_formatter.html
@@ -81,7 +81,7 @@

Source code for data_juicer.format.parquet_formatter

""" SUFFIXES = ['.parquet'] - def __init__(self, dataset_path, suffixes=None, **kwargs): +
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -94,7 +94,7 @@

Source code for data_juicer.format.parquet_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='parquet', **kwargs, - )
+ )
diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html
index 4140a5925..28fffda7f 100644
--- a/_modules/data_juicer/format/text_formatter.html
+++ b/_modules/data_juicer/format/text_formatter.html
@@ -143,7 +143,7 @@

Source code for data_juicer.format.text_formatter

'.m', '.smali' ] - def __init__(self, +
[docs] def __init__(self, dataset_path, suffixes=None, add_suffix=False, @@ -165,7 +165,7 @@

Source code for data_juicer.format.text_formatter

**kwargs, ) self.dataset_path = dataset_path - self.add_suffix = add_suffix + self.add_suffix = add_suffix
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: """
diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html
index 7e52a8fa5..c37226a65 100644
--- a/_modules/data_juicer/format/tsv_formatter.html
+++ b/_modules/data_juicer/format/tsv_formatter.html
@@ -81,7 +81,7 @@

Source code for data_juicer.format.tsv_formatter

""" SUFFIXES = ['.tsv'] - def __init__(self, dataset_path, suffixes=None, **kwargs): +
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -95,7 +95,7 @@

Source code for data_juicer.format.tsv_formatter

type='csv', delimiter='\t', **kwargs, - )
+ )
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index c42d12535..ec4494caa 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -76,7 +76,7 @@

Source code for data_juicer.ops.base_op

 
 
[docs]class Mapper: - def __init__(self, text_key: str = None): +
[docs] def __init__(self, text_key: str = None): """ Base class that conducts text editing. @@ -90,7 +90,7 @@

Source code for data_juicer.ops.base_op

         self.process = wrap_func_with_nested_access(self.process)
 
         # In default, it's a normal OP instead of batched OP
-        self._batched_op = False
+        self._batched_op = False
[docs] def process(self, sample): """ @@ -107,7 +107,7 @@

Source code for data_juicer.ops.base_op

 
 
[docs]class Filter: - def __init__(self, text_key: str = None): +
[docs] def __init__(self, text_key: str = None): """ Base class that removes specific info. @@ -119,7 +119,7 @@

Source code for data_juicer.ops.base_op

         self.text_key = text_key
         from data_juicer.core.data import wrap_func_with_nested_access
         self.process = wrap_func_with_nested_access(self.process)
-        self.compute_stats = wrap_func_with_nested_access(self.compute_stats)
+        self.compute_stats = wrap_func_with_nested_access(self.compute_stats)
[docs] def compute_stats(self, sample, context=False): """ @@ -145,7 +145,7 @@

Source code for data_juicer.ops.base_op

 
 
[docs]class Deduplicator: - def __init__(self, text_key: str = None): +
[docs] def __init__(self, text_key: str = None): """ Base class that conducts deduplication. @@ -157,7 +157,7 @@

Source code for data_juicer.ops.base_op

         self.text_key = text_key
         from data_juicer.core.data import wrap_func_with_nested_access
         self.process = wrap_func_with_nested_access(self.process)
-        self.compute_hash = wrap_func_with_nested_access(self.compute_hash)
+        self.compute_hash = wrap_func_with_nested_access(self.compute_hash)
[docs] def compute_hash(self, sample): """ @@ -182,7 +182,7 @@

Source code for data_juicer.ops.base_op

 
 
[docs]class Selector: - def __init__(self, text_key: str = None): +
[docs] def __init__(self, text_key: str = None): """ Base class that conducts selection in dataset-level. @@ -193,7 +193,7 @@

Source code for data_juicer.ops.base_op

             text_key = 'text'
         self.text_key = text_key
         from data_juicer.core.data import wrap_func_with_nested_access
-        self.process = wrap_func_with_nested_access(self.process)
+        self.process = wrap_func_with_nested_access(self.process)
[docs] def process(self, dataset): """
diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html
index f1b319fbe..865499308 100644
--- a/_modules/data_juicer/ops/common/helper_func.html
+++ b/_modules/data_juicer/ops/common/helper_func.html
@@ -79,9 +79,9 @@

Source code for data_juicer.ops.common.helper_func

[docs]class UnionFind: - def __init__(self): +
[docs] def __init__(self): """Initialization method.""" - self.parent: Dict[int, int] = {} + self.parent: Dict[int, int] = {}
[docs] def find(self, x): if x not in self.parent:
diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
index 8596040c7..6a619f12f 100644
--- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
@@ -93,7 +93,7 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

Using md5 hash to deduplicate samples. """ - def __init__(self, +
[docs] def __init__(self, lowercase: bool = False, ignore_non_character: bool = False, *args, @@ -111,7 +111,7 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

self.lowercase = lowercase self.remove_non_character_regex = re.compile( f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605 - ) if ignore_non_character else None + ) if ignore_non_character else None
[docs] def compute_hash(self, sample): """
diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
index 01ea211d4..72dfde50c 100644
--- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
@@ -169,7 +169,7 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator kept in the final dataset. """ - def __init__( +
[docs] def __init__( self, tokenization: str = 'space', window_size: PositiveInt = 5, @@ -251,7 +251,7 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicatorgen.randint(0, MERSENNE_PRIME, dtype=np.uint64), ) for _ in range(self.num_permutation)], dtype=np.uint64, - ).T + ).T

[docs] def compute_hash(self, sample): """
diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
index 85206bbe8..d393a4ba7 100644
--- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
@@ -135,7 +135,7 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicatorclass DocumentSimhashDeduplicator(Deduplicator): """Deduplicator to deduplicate samples at document-level using SimHash.""" - def __init__(self, +
[docs] def __init__(self, tokenization: str = 'space', window_size: PositiveInt = 6, lowercase: bool = True, @@ -181,7 +181,7 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator# about deduplication self.num_blocks = num_blocks - self.hamming_distance = hamming_distance + self.hamming_distance = hamming_distance

[docs] def compute_hash(self, sample): """
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
index 550b89c6f..42150f84c 100644
--- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html
+++ b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
@@ -85,7 +85,7 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

"""Filter to keep samples with alphabet/numeric ratio within a specific range.""" - def __init__(self, +
[docs] def __init__(self, tokenization: bool = False, min_ratio: float = 0.25, max_ratio: PositiveFloat = sys.maxsize, @@ -116,7 +116,7 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

if tokenization: self.model_key = prepare_model( model_type='huggingface', - model_key='EleutherAI/pythia-6.9b-deduped') + model_key='EleutherAI/pythia-6.9b-deduped')
[docs] def compute_stats(self, sample): if self.tokenization:
diff --git a/_modules/data_juicer/ops/filter/average_line_length_filter.html b/_modules/data_juicer/ops/filter/average_line_length_filter.html
index a224804e8..4d58914b2 100644
--- a/_modules/data_juicer/ops/filter/average_line_length_filter.html
+++ b/_modules/data_juicer/ops/filter/average_line_length_filter.html
@@ -85,7 +85,7 @@

Source code for data_juicer.ops.filter.average_line_length_filter

"""Filter to keep samples with average line length within a specific range.""" - def __init__(self, +
[docs] def __init__(self, min_len: PositiveInt = 10, max_len: PositiveInt = sys.maxsize, *args, @@ -104,7 +104,7 @@

Source code for data_juicer.ops.filter.average_line_length_filter

""" super().__init__(*args, **kwargs) self.min_len = min_len - self.max_len = max_len + self.max_len = max_len
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/character_repetition_filter.html b/_modules/data_juicer/ops/filter/character_repetition_filter.html
index db03058bc..40e1b1c3f 100644
--- a/_modules/data_juicer/ops/filter/character_repetition_filter.html
+++ b/_modules/data_juicer/ops/filter/character_repetition_filter.html
@@ -86,7 +86,7 @@

Source code for data_juicer.ops.filter.character_repetition_filter

"""Filter to keep samples with char-level n-gram repetition ratio within a \ specific range.""" - def __init__(self, +
[docs] def __init__(self, rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, @@ -108,7 +108,7 @@

Source code for data_juicer.ops.filter.character_repetition_filter

super().__init__(*args, **kwargs) self.n = rep_len self.min_ratio = min_ratio - self.max_ratio = max_ratio + self.max_ratio = max_ratio
[docs] def compute_stats(self, sample): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/flagged_words_filter.html b/_modules/data_juicer/ops/filter/flagged_words_filter.html
index 39bce2dc8..8be147f2a 100644
--- a/_modules/data_juicer/ops/filter/flagged_words_filter.html
+++ b/_modules/data_juicer/ops/filter/flagged_words_filter.html
@@ -91,7 +91,7 @@

Source code for data_juicer.ops.filter.flagged_words_filter

"""Filter to keep samples with flagged-word ratio less than a specific max value.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', tokenization: bool = False, max_ratio: ClosedUnitInterval = 0.045, @@ -138,7 +138,7 @@

Source code for data_juicer.ops.filter.flagged_words_filter

] if tokenization: self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') + model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/language_id_score_filter.html b/_modules/data_juicer/ops/filter/language_id_score_filter.html
index ba90853b8..0cbcc1d89 100644
--- a/_modules/data_juicer/ops/filter/language_id_score_filter.html
+++ b/_modules/data_juicer/ops/filter/language_id_score_filter.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.filter.language_id_score_filter

"""Filter to keep samples in a specific language with confidence score larger than a specific min value.""" - def __init__(self, +
[docs] def __init__(self, lang: str = '', min_score: ClosedUnitInterval = 0.8, *args, @@ -100,7 +100,7 @@

Source code for data_juicer.ops.filter.language_id_score_filter

super().__init__(*args, **kwargs) self.lang = lang self.min_score = min_score - self.model_key = prepare_model(lang=lang, model_type='fasttext') + self.model_key = prepare_model(lang=lang, model_type='fasttext')
[docs] def compute_stats(self, sample): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
index ba53aea86..f8977679c 100644
--- a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
+++ b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
@@ -85,7 +85,7 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

"""Filter to keep samples with maximum line length within a specific range.""" - def __init__(self, +
[docs] def __init__(self, min_len: PositiveInt = 10, max_len: PositiveInt = sys.maxsize, *args, @@ -104,7 +104,7 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

""" super().__init__(*args, **kwargs) self.min_len = min_len - self.max_len = max_len + self.max_len = max_len
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/perplexity_filter.html b/_modules/data_juicer/ops/filter/perplexity_filter.html
index ebaa84b09..e91dfdeab 100644
--- a/_modules/data_juicer/ops/filter/perplexity_filter.html
+++ b/_modules/data_juicer/ops/filter/perplexity_filter.html
@@ -89,7 +89,7 @@

Source code for data_juicer.ops.filter.perplexity_filter

"""Filter to keep samples with perplexity score less than a specific max value.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', max_ppl: PositiveFloat = 1500, *args, @@ -108,7 +108,7 @@

Source code for data_juicer.ops.filter.perplexity_filter

self.lang = lang self.sp_model_key = prepare_model(lang=lang, model_type='sentencepiece') - self.kl_model_key = prepare_model(lang=lang, model_type='kenlm') + self.kl_model_key = prepare_model(lang=lang, model_type='kenlm')
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/special_characters_filter.html b/_modules/data_juicer/ops/filter/special_characters_filter.html
index 002b6d2c7..b88e299f8 100644
--- a/_modules/data_juicer/ops/filter/special_characters_filter.html
+++ b/_modules/data_juicer/ops/filter/special_characters_filter.html
@@ -86,7 +86,7 @@

Source code for data_juicer.ops.filter.special_characters_filter

"""Filter to keep samples with special-char ratio within a specific range.""" - def __init__(self, +
[docs] def __init__(self, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.25, *args, @@ -105,7 +105,7 @@

Source code for data_juicer.ops.filter.special_characters_filter

""" super().__init__(*args, **kwargs) self.min_ratio = min_ratio - self.max_ratio = max_ratio + self.max_ratio = max_ratio
[docs] def compute_stats(self, sample): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/specified_field_filter.html b/_modules/data_juicer/ops/filter/specified_field_filter.html
index b8099599c..cfa325292 100644
--- a/_modules/data_juicer/ops/filter/specified_field_filter.html
+++ b/_modules/data_juicer/ops/filter/specified_field_filter.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.filter.specified_field_filter

specified target value, the sample will be filtered. """ - def __init__(self, +
[docs] def __init__(self, field_key: str = '', target_value: Union[List, Tuple] = [], *args, @@ -102,7 +102,7 @@

Source code for data_juicer.ops.filter.specified_field_filter

""" super().__init__(*args, **kwargs) self.field_key = field_key - self.target_value = target_value + self.target_value = target_value
[docs] def compute_stats(self, sample): return sample
diff --git a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
index 20723d1d8..d7969cb19 100644
--- a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
+++ b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
@@ -93,7 +93,7 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

specified range, the sample will be filtered. """ - def __init__(self, +
[docs] def __init__(self, field_key: str = '', min_value: float = -sys.maxsize, max_value: float = sys.maxsize, @@ -118,7 +118,7 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

super().__init__(*args, **kwargs) self.field_key = field_key self.min_value = min_value - self.max_value = max_value + self.max_value = max_value
[docs] def compute_stats(self, sample): return sample
diff --git a/_modules/data_juicer/ops/filter/stopwords_filter.html b/_modules/data_juicer/ops/filter/stopwords_filter.html
index 405113a62..30773510a 100644
--- a/_modules/data_juicer/ops/filter/stopwords_filter.html
+++ b/_modules/data_juicer/ops/filter/stopwords_filter.html
@@ -91,7 +91,7 @@

Source code for data_juicer.ops.filter.stopwords_filter

"""Filter to keep samples with stopword ratio larger than a specific min value.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', tokenization: bool = False, min_ratio: ClosedUnitInterval = 0.3, @@ -136,7 +136,7 @@

Source code for data_juicer.ops.filter.stopwords_filter

] if tokenization: self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') + model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/suffix_filter.html b/_modules/data_juicer/ops/filter/suffix_filter.html
index 0925d4988..43f502e27 100644
--- a/_modules/data_juicer/ops/filter/suffix_filter.html
+++ b/_modules/data_juicer/ops/filter/suffix_filter.html
@@ -80,7 +80,7 @@

Source code for data_juicer.ops.filter.suffix_filter

class SuffixFilter(Filter): """Filter to keep samples with specified suffix.""" - def __init__(self, +
[docs] def __init__(self, suffixes: Union[str, List[str], Tuple[str]] = [], *args, **kwargs): @@ -98,7 +98,7 @@

Source code for data_juicer.ops.filter.suffix_filter

elif isinstance(suffixes, str): self.suffixes = [suffixes] else: - self.suffixes = suffixes + self.suffixes = suffixes
[docs] def compute_stats(self, sample): return sample
diff --git a/_modules/data_juicer/ops/filter/text_length_filter.html b/_modules/data_juicer/ops/filter/text_length_filter.html
index 936f4ccf5..7ed079cd8 100644
--- a/_modules/data_juicer/ops/filter/text_length_filter.html
+++ b/_modules/data_juicer/ops/filter/text_length_filter.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.filter.text_length_filter

"""Filter to keep samples with total text length within a specific range.""" - def __init__(self, +
[docs] def __init__(self, min_len: PositiveInt = 10, max_len: PositiveInt = sys.maxsize, *args, @@ -102,7 +102,7 @@

Source code for data_juicer.ops.filter.text_length_filter

""" super().__init__(*args, **kwargs) self.min_len = min_len - self.max_len = max_len + self.max_len = max_len
[docs] def compute_stats(self, sample): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/token_num_filter.html b/_modules/data_juicer/ops/filter/token_num_filter.html
index 5cba586b1..49a7af726 100644
--- a/_modules/data_juicer/ops/filter/token_num_filter.html
+++ b/_modules/data_juicer/ops/filter/token_num_filter.html
@@ -85,7 +85,7 @@

Source code for data_juicer.ops.filter.token_num_filter

"""Filter to keep samples with total token number within a specific range.""" - def __init__(self, +
[docs] def __init__(self, hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: PositiveInt = 10, max_num: PositiveInt = sys.maxsize, @@ -109,7 +109,7 @@

Source code for data_juicer.ops.filter.token_num_filter

self.max_num = max_num self.hf_tokenizer = hf_tokenizer self.model_key = prepare_model(model_type='huggingface', - model_key=hf_tokenizer) + model_key=hf_tokenizer)
[docs] def compute_stats(self, sample): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/word_num_filter.html b/_modules/data_juicer/ops/filter/word_num_filter.html
index b47df1c8a..04e5f048d 100644
--- a/_modules/data_juicer/ops/filter/word_num_filter.html
+++ b/_modules/data_juicer/ops/filter/word_num_filter.html
@@ -88,7 +88,7 @@

Source code for data_juicer.ops.filter.word_num_filter

"""Filter to keep samples with total words number within a specific range.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', tokenization: bool = False, min_num: PositiveInt = 10, @@ -117,7 +117,7 @@

Source code for data_juicer.ops.filter.word_num_filter

if tokenization: self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') + model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/word_repetition_filter.html b/_modules/data_juicer/ops/filter/word_repetition_filter.html
index 0b789d646..5ecdcedab 100644
--- a/_modules/data_juicer/ops/filter/word_repetition_filter.html
+++ b/_modules/data_juicer/ops/filter/word_repetition_filter.html
@@ -90,7 +90,7 @@

Source code for data_juicer.ops.filter.word_repetition_filter

"""Filter to keep samples with word-level n-gram repetition ratio within a \ specific range.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', tokenization: bool = False, rep_len: PositiveInt = 10, @@ -122,7 +122,7 @@

Source code for data_juicer.ops.filter.word_repetition_filter

if tokenization: self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') + model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False): # check if it's computed already
diff --git a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
index 5b2467929..30e3a5c79 100644
--- a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.mapper.clean_copyright_mapper

"""Mapper to clean copyright comments at the beginning of the text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -92,7 +92,7 @@

Source code for data_juicer.ops.mapper.clean_copyright_mapper

""" super().__init__(*args, **kwargs) self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/') - self.cpat = re.compile('copyright', re.IGNORECASE) + self.cpat = re.compile('copyright', re.IGNORECASE)
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_email_mapper.html b/_modules/data_juicer/ops/mapper/clean_email_mapper.html
index 518d07904..1c8ef5726 100644
--- a/_modules/data_juicer/ops/mapper/clean_email_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_email_mapper.html
@@ -78,7 +78,7 @@

Source code for data_juicer.ops.mapper.clean_email_mapper

class CleanEmailMapper(Mapper): """Mapper to clean email in text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -86,7 +86,7 @@

Source code for data_juicer.ops.mapper.clean_email_mapper

:param kwargs: extra args """ super().__init__(*args, **kwargs) - self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+' + self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_html_mapper.html b/_modules/data_juicer/ops/mapper/clean_html_mapper.html
index 6805312be..1de758f20 100644
--- a/_modules/data_juicer/ops/mapper/clean_html_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_html_mapper.html
@@ -82,14 +82,14 @@

Source code for data_juicer.ops.mapper.clean_html_mapper

class CleanHtmlMapper(Mapper): """Mapper to clean html code in text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. :param args: extra args :param kwargs: extra args """ - super().__init__(*args, **kwargs) + super().__init__(*args, **kwargs)
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
index a78962f15..5c1afded5 100644
--- a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
@@ -78,7 +78,7 @@

Source code for data_juicer.ops.mapper.clean_ip_mapper

class CleanIpMapper(Mapper): """Mapper to clean ipv4 and ipv6 address in text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -91,7 +91,7 @@

Source code for data_juicer.ops.mapper.clean_ip_mapper

self.pattern += r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))' self.pattern += r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|' self.pattern += r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|' - self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6 + self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_links_mapper.html b/_modules/data_juicer/ops/mapper/clean_links_mapper.html
index 48476125d..a7cd5811b 100644
--- a/_modules/data_juicer/ops/mapper/clean_links_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_links_mapper.html
@@ -81,7 +81,7 @@

Source code for data_juicer.ops.mapper.clean_links_mapper

class CleanLinksMapper(Mapper): """Mapper to clean links like http/https/ftp in text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -96,7 +96,7 @@

Source code for data_juicer.ops.mapper.clean_links_mapper

self.pattern += r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))' self.pattern += r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' self.pattern += r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])' - self.pattern += r')' + self.pattern += r')'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
index 91b5dc91e..2d11c165b 100644
--- a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
+++ b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
@@ -83,14 +83,14 @@

Source code for data_juicer.ops.mapper.expand_macro_mapper

"""Mapper to expand macro definitions in the document body of Latex samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. :param args: extra args :param kwargs: extra args """ - super().__init__(*args, **kwargs) + super().__init__(*args, **kwargs)
def _build_non_arg_macros_dict(self, file_content): # regex for extracting \newcommand macros without arguments
diff --git a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
index def7fdfb0..55b4c8923 100644
--- a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
+++ b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
@@ -78,14 +78,14 @@

Source code for data_juicer.ops.mapper.fix_unicode_mapper

class FixUnicodeMapper(Mapper): """Mapper to fix unicode errors in text samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. :param args: extra args :param kwargs: extra args """ - super().__init__(*args, **kwargs) + super().__init__(*args, **kwargs)
[docs] def process(self, sample): sample[self.text_key] = ftfy.fix_text(sample[self.text_key])
diff --git a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
index d1dad3ad2..66ef88958 100644
--- a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
+++ b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
@@ -85,7 +85,7 @@

Source code for data_juicer.ops.mapper.nlpaug_en_mapper

class NlpaugEnMapper(Mapper): """Mapper to simply augment samples in English based on nlpaug library.""" - def __init__(self, +
[docs] def __init__(self, sequential: bool = False, aug_num: int = 1, delete_random_word: bool = False, @@ -180,7 +180,7 @@

Source code for data_juicer.ops.mapper.nlpaug_en_mapper

if self.sequential: self.aug = naf.Sequential(aug_pipeline) else: - self.aug = aug_pipeline + self.aug = aug_pipeline
[docs] def process(self, samples): # no augmentation methods are opened
diff --git a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
index 8501832d4..cce5ec9c6 100644
--- a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
+++ b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
@@ -81,7 +81,7 @@

Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

class NlpcdaZhMapper(Mapper): """Mapper to simply augment samples in Chinese based on nlpcda library.""" - def __init__(self, +
[docs] def __init__(self, sequential: bool = False, aug_num: int = 1, replace_similar_word: bool = False, @@ -185,7 +185,7 @@

Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

if not self.sequential or len(self.aug_pipeline) == 0 \ else 2 self.aug_pipeline.append( - nlpcda.EquivalentChar(create_num=create_num)) + nlpcda.EquivalentChar(create_num=create_num))
[docs] def process(self, samples): # no augmentation methods are opened
diff --git a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
index 5cb8e8d7d..05b5c7104 100644
--- a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
+++ b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
@@ -81,7 +81,7 @@

Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

"""Mapper to normalize unicode punctuations to English punctuations in text \ samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -124,7 +124,7 @@

Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

'】': ']', '%': '%', '►': '-', - } + }
[docs] def process(self, sample): sample[self.text_key] = ''.join([
diff --git a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
index 6b077694e..03f642ec3 100644
--- a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.mapper.remove_bibliography_mapper

"""Mapper to remove bibliography at the end of documents in Latex samples.""" - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. @@ -96,7 +96,7 @@

Source code for data_juicer.ops.mapper.remove_bibliography_mapper

self.pattern += r'\\begin\{REFERENCES\}|' self.pattern += r'\\begin\{thebibliography\}|' self.pattern += r'\\bibliography\{.*\}' - self.pattern += r').*$' + self.pattern += r').*$'
[docs] def process(self, sample): sample[self.text_key] = re.sub(pattern=self.pattern,
diff --git a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
index 5eedf088c..33c667184 100644
--- a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
@@ -88,7 +88,7 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

Only support 'tex' \ for now. """ - def __init__(self, +
[docs] def __init__(self, doc_type: Union[str, List[str]] = 'tex', inline: bool = True, multiline: bool = True, @@ -106,7 +106,7 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

super().__init__(*args, **kwargs) self.doc_type = doc_type self.inline = inline - self.multiline = multiline + self.multiline = multiline
[docs] def process(self, sample): # TODO: remove different comments by sample type
diff --git a/_modules/data_juicer/ops/mapper/remove_header_mapper.html b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
index 0c9da4f6e..b21c95e44 100644
--- a/_modules/data_juicer/ops/mapper/remove_header_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
@@ -84,7 +84,7 @@

Source code for data_juicer.ops.mapper.remove_header_mapper

"""Mapper to remove headers at the beginning of documents in Latex samples.""" - def __init__(self, drop_no_head: bool = True, *args, **kwargs): +
[docs] def __init__(self, drop_no_head: bool = True, *args, **kwargs): """ Initialization method. @@ -104,7 +104,7 @@

Source code for data_juicer.ops.mapper.remove_header_mapper

self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}' self.pattern += r')' - self.drop_no_head = drop_no_head + self.drop_no_head = drop_no_head
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
index ff9eb9952..d6b0357ae 100644
--- a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
@@ -86,7 +86,7 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

class RemoveLongWordsMapper(Mapper): """Mapper to remove long words within a specific range.""" - def __init__(self, +
[docs] def __init__(self, min_len: PositiveInt = 1, max_len: PositiveInt = sys.maxsize, *args, @@ -103,7 +103,7 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

""" super().__init__(*args, **kwargs) self.min_len = min_len - self.max_len = max_len + self.max_len = max_len
[docs] def should_keep_long_word(self, word): if self.min_len <= len(word) <= self.max_len:
diff --git a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
index 8253ed2a8..7e82c551f 100644
--- a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
@@ -80,7 +80,7 @@

Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

class RemoveSpecificCharsMapper(Mapper): """Mapper to clean specific chars in text samples.""" - def __init__(self, +
[docs] def __init__(self, chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs): @@ -97,7 +97,7 @@

Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

if chars_to_remove: self.pattern = '[' + '|'.join(chars_to_remove) + ']' else: - self.pattern = None + self.pattern = None
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
index 660a0ed3f..a018dc7f6 100644
--- a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
@@ -87,7 +87,7 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

number of tables. """ - def __init__(self, +
[docs] def __init__(self, min_col: from_2_to_20 = 2, max_col: from_2_to_20 = 20, *args, @@ -103,7 +103,7 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

super().__init__(*args, **kwargs) self.min_col = min_col self.max_col = max_col - self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}' + self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
index 9f7557eee..42076b59a 100644
--- a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
@@ -83,7 +83,7 @@

Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring class RemoveWordsWithIncorrectSubstringsMapper(Mapper): """Mapper to remove words with incorrect substrings.""" - def __init__(self, +
[docs] def __init__(self, lang: str = 'en', tokenization: bool = False, substrings: List = None, @@ -106,7 +106,7 @@

Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring self.lang = lang if tokenization: self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') + model_type='sentencepiece')

[docs] def should_keep_word_with_incorrect_substrings(self, word, substrings): word = strip(word, SPECIAL_CHARACTERS)
diff --git a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
index ed1552d42..07d6522b6 100644
--- a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
+++ b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
@@ -79,7 +79,7 @@

Source code for data_juicer.ops.mapper.sentence_split_mapper

class SentenceSplitMapper(Mapper): """Mapper to split text samples to sentences.""" - def __init__(self, lang: str = 'en', *args, **kwargs): +
[docs] def __init__(self, lang: str = 'en', *args, **kwargs): """ Initialization method. @@ -89,7 +89,7 @@

Source code for data_juicer.ops.mapper.sentence_split_mapper

""" super().__init__(*args, **kwargs) self.lang = lang - self.model_key = prepare_model(lang=lang, model_type='nltk') + self.model_key = prepare_model(lang=lang, model_type='nltk')
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
index ff6535bd1..0ecd132bf 100644
--- a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
+++ b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
@@ -86,14 +86,14 @@

Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

< https://en.wikipedia.org/wiki/Whitespace_character """ - def __init__(self, *args, **kwargs): +
[docs] def __init__(self, *args, **kwargs): """ Initialization method. :param args: extra args :param kwargs: extra args """ - super().__init__(*args, **kwargs) + super().__init__(*args, **kwargs)
[docs] def process(self, sample): # remove whitespaces before and after the main content
diff --git a/_modules/data_juicer/ops/op_fusion.html b/_modules/data_juicer/ops/op_fusion.html
index 25922ad8d..78475498d 100644
--- a/_modules/data_juicer/ops/op_fusion.html
+++ b/_modules/data_juicer/ops/op_fusion.html
@@ -177,14 +177,14 @@

Source code for data_juicer.ops.op_fusion

 
 
[docs]class FusedFilter(Filter): """A fused operator for filters.""" - def __init__(self, fused_filters: List): +
[docs] def __init__(self, fused_filters: List): """ Initialization method. :param fused_filers: a list of filters to be fused. """ super().__init__() - self.fused_filters = fused_filters + self.fused_filters = fused_filters
[docs] def compute_stats(self, sample): # context for the intermediate vars
diff --git a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
index dec801b69..c68e22df8 100644
--- a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
+++ b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
@@ -81,7 +81,7 @@

Source code for data_juicer.ops.selector.frequency_specified_field_selector< """Selector to select samples based on the sorted frequency of specified field.""" - def __init__(self, +
[docs] def __init__(self, field_key: str = '', top_ratio: ClosedUnitInterval = None, topk: PositiveInt = None, @@ -114,7 +114,7 @@

Source code for data_juicer.ops.selector.frequency_specified_field_selector< self.field_key = field_key self.top_ratio = top_ratio self.topk = topk - self.reverse = reverse + self.reverse = reverse

[docs] def process(self, dataset): if len(dataset) <= 1 or not self.field_key:
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
index 6ebf81c65..f5302e79a 100644
--- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
+++ b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
@@ -92,7 +92,7 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< """Selector to select top samples based on the sorted specified field value.""" - def __init__(self, +
[docs] def __init__(self, field_key: str = '', top_ratio: ClosedUnitInterval = None, topk: PositiveInt = None, @@ -125,7 +125,7 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< self.field_key = field_key self.top_ratio = top_ratio self.topk = topk - self.reverse = reverse + self.reverse = reverse
[docs] def process(self, dataset): if len(dataset) <= 1 or not self.field_key:
diff --git a/_modules/data_juicer/utils/ckpt_utils.html b/_modules/data_juicer/utils/ckpt_utils.html
index 857d86de7..f8062a475 100644
--- a/_modules/data_juicer/utils/ckpt_utils.html
+++ b/_modules/data_juicer/utils/ckpt_utils.html
@@ -86,7 +86,7 @@

Source code for data_juicer.utils.ckpt_utils

    rerun from the beginning.
     """
 
-    def __init__(self, ckpt_dir, original_process_list, num_proc=1):
+
[docs] def __init__(self, ckpt_dir, original_process_list, num_proc=1): """ Initialization method. @@ -101,7 +101,7 @@

Source code for data_juicer.utils.ckpt_utils

self.num_proc = num_proc
         self.op_record = []
 
-        self.ckpt_available = self.check_ckpt()
+        self.ckpt_available = self.check_ckpt()
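For orientation, a rough usage sketch of the checkpoint manager documented here; the checkpoint directory and the op list below are hypothetical, with the op list written in the single-key-dict style data_juicer configs use.
    from data_juicer.utils.ckpt_utils import CheckpointManager

    # hypothetical op list in the single-key-dict style used by data_juicer configs
    process_list = [{'whitespace_normalization_mapper': {}},
                    {'text_length_filter': {'min_len': 10}}]
    ckpt = CheckpointManager(ckpt_dir='outputs/ckpt',
                             original_process_list=process_list)
    if ckpt.ckpt_available:
        # only the ops that have not finished yet need to be re-run
        process_list = ckpt.get_left_process_list()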
[docs] def get_left_process_list(self): """ diff --git a/_modules/data_juicer/utils/compress.html b/_modules/data_juicer/utils/compress.html index fe83f1afa..70bfaf5d7 100644 --- a/_modules/data_juicer/utils/compress.html +++ b/_modules/data_juicer/utils/compress.html @@ -250,7 +250,7 @@

Source code for data_juicer.utils.compress

     using compression format algorithms.
     """
 
-    def __init__(self, compressor_format: str = 'zstd'):
+
[docs] def __init__(self, compressor_format: str = 'zstd'): """ Initialization method. @@ -261,7 +261,7 @@

Source code for data_juicer.utils.compress

         assert compressor_format in Compressor.compressors.keys()
         self.compressor_format = compressor_format
         self.compressor = Compressor
-        self.extractor = Extractor
+        self.extractor = Extractor
[docs] def compress( self, @@ -295,7 +295,7 @@

Source code for data_juicer.utils.compress

     using compression format algorithms.
     """
 
-    def __init__(self, compressor_format: str = 'zstd'):
+
[docs] def __init__(self, compressor_format: str = 'zstd'): """ Initialization method. @@ -306,7 +306,7 @@

Source code for data_juicer.utils.compress

         self.compressor_extension = '.' + compressor_format
         self.compress_manager = CompressManager(
             compressor_format=compressor_format)
-        self.pattern = re.compile('_\d{5}_of_')  # noqa W605
+        self.pattern = re.compile('_\d{5}_of_')  # noqa W605
def _get_raw_filename(self, filename: Union[Path, str]): """ diff --git a/_modules/data_juicer/utils/fingerprint_utils.html b/_modules/data_juicer/utils/fingerprint_utils.html index 590626228..19b99ed8f 100644 --- a/_modules/data_juicer/utils/fingerprint_utils.html +++ b/_modules/data_juicer/utils/fingerprint_utils.html @@ -86,8 +86,8 @@

Source code for data_juicer.utils.fingerprint_utils

dispatch: Dict = {} - def __init__(self): - self.m = xxhash.xxh64() +
[docs] def __init__(self): + self.m = xxhash.xxh64()
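A minimal sketch of the hashing helper: hash_bytes (documented just below) is a classmethod that returns a hex digest string for raw bytes.
    from data_juicer.utils.fingerprint_utils import Hasher

    # classmethod call, no instance needed; returns a hex digest string
    print(Hasher.hash_bytes(b'data-juicer'))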
[docs] @classmethod def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str: diff --git a/_modules/data_juicer/utils/logger_utils.html b/_modules/data_juicer/utils/logger_utils.html index c034586b7..cf2e76dfe 100644 --- a/_modules/data_juicer/utils/logger_utils.html +++ b/_modules/data_juicer/utils/logger_utils.html @@ -114,7 +114,7 @@

Source code for data_juicer.utils.logger_utils

[docs]class StreamToLoguru: """Stream object that redirects writes to a logger instance.""" - def __init__(self, level='INFO', caller_names=('datasets', 'logging')): +

[docs] def __init__(self, level='INFO', caller_names=('datasets', 'logging')): """ Initialization method. @@ -124,7 +124,7 @@

Source code for data_juicer.utils.logger_utils

""" self.level = level self.linebuf = '' - self.caller_names = caller_names + self.caller_names = caller_names

[docs] def write(self, buf): full_name = get_caller_name(depth=1) diff --git a/_modules/data_juicer/utils/registry.html b/_modules/data_juicer/utils/registry.html index 134abaa41..d07b77ab0 100644 --- a/_modules/data_juicer/utils/registry.html +++ b/_modules/data_juicer/utils/registry.html @@ -95,14 +95,14 @@

Source code for data_juicer.utils.registry

     """This class is used to register some modules to registry by a repo
     name."""
 
-    def __init__(self, name: str):
+
[docs] def __init__(self, name: str): """ Initialization method. :param name: a registry repo name """ self._name = name - self._modules = {} + self._modules = {}
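A minimal sketch of the registry above, using only the members documented in this section (creating a named repo and inspecting it); actual module registration is omitted.
    from data_juicer.utils.registry import Registry

    OPERATORS = Registry('operators')   # a registry repo named 'operators'
    print(OPERATORS.name)               # -> 'operators'
    print(OPERATORS.modules)            # -> {} until modules are registered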
@property def name(self): diff --git a/data_juicer.analysis.html b/data_juicer.analysis.html index 6c08d1363..672474713 100644 --- a/data_juicer.analysis.html +++ b/data_juicer.analysis.html @@ -108,6 +108,19 @@

class data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]

Bases: object

Apply analysis on each column of stats respectively.

+
+
+__init__(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]
+

Initialization method +:param dataset: the dataset to be analysed +:param output_path: path to store the analysis results +:param overall_result: optional precomputed overall stats result +:param save_stats_in_one_file: whether to save all analysis figures of all

+
+

stats into one image file

+
+
+
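A hedged usage sketch of the constructor documented above; `dataset` is a placeholder for a dataset whose per-sample stats have already been computed, and the output directory is hypothetical.
    from data_juicer.analysis.column_wise_analysis import ColumnWiseAnalysis

    # `dataset` must already carry the per-sample stats produced by the filters
    analysis = ColumnWiseAnalysis(dataset, output_path='analysis_out/')
    analysis.analyse(show_percentiles=True)   # draws one figure per stats column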
analyse(show_percentiles=False, show=False)[source]
@@ -201,6 +214,15 @@

d a t a _ j u i c e r . a n a l y s i sobject

Apply diversity analysis for each sample and get an overall analysis result.

+
+
+__init__(dataset, output_path, lang_or_model='en')[source]
+

Initialization method :param dataset: the dataset to be analysed +:param output_path: path to store the analysis results +:param lang_or_model: the diversity model or a specific language used to load +the diversity model.

+
+
analyse(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]
@@ -307,6 +329,20 @@

d a t a _ j u i c e r . a n a l y s i sobject

Apply analysis on the overall stats, including mean, std, quantiles, etc.

+
+
+__init__(dataset, output_path)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset – the dataset to be analysed

  • +
  • output_path – path to store the analysis results.

  • +
+
+
+
+
analyse(percentiles=[])[source]
diff --git a/data_juicer.core.html b/data_juicer.core.html index 0b560b21e..3f3c1c769 100644 --- a/data_juicer.core.html +++ b/data_juicer.core.html @@ -126,6 +126,17 @@

d a t a _ j u i c e r . c o r e +
+__init__(cfg=None)[source]
+

Initialization method.

+
+
Parameters:
+

cfg – optional config dict.

+
+
+

+
run(load_data_np=None)[source]
@@ -150,6 +161,11 @@

class data_juicer.core.data.NestedDataset(*args, **kargs)[source]

Bases: Dataset

Enhanced HuggingFace-Dataset for better usability and efficiency.

+
+
+__init__(*args, **kargs)[source]
+
+
add_column(*args, **kargs)[source]
@@ -214,6 +230,11 @@

class data_juicer.core.data.NestedDatasetDict(*args, **kargs)[source]

Bases: DatasetDict

Enhanced HuggingFace-DatasetDict for better usability and efficiency.

+
+
+__init__(*args, **kargs)[source]
+
+
map(**args)[source]
@@ -228,6 +249,11 @@

class data_juicer.core.data.NestedQueryDict(*args, **kargs)[source]

Bases: dict

Enhanced dict for better usability.

+
+
+__init__(*args, **kargs)[source]
+
+

@@ -288,6 +314,17 @@

d a t a _ j u i c e r . c o r e +
+__init__(cfg=None)[source]
+

Initialization method.

+
+
Parameters:
+

cfg – optional config dict.

+
+
+

+
run(load_data_np=None)[source]
@@ -333,6 +370,25 @@

d a t a _ j u i c e r . c o r eTiB = 1099511627776

+
+
+__init__(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, export_stats=True)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • export_path – the path to export datasets.

  • +
  • export_shard_size – the size of each shard of the exported +dataset. By default it’s 0, which means the dataset will be exported +to a single file.

  • +
  • num_proc – number of processes used to export the dataset.

  • +
  • export_ds – whether to export the dataset contents.

  • +
  • export_stats – whether to export the stats of dataset.

  • +
+
+
+
+
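A hedged sketch of the exporter documented above; the paths are hypothetical and `processed_dataset` stands for the dataset produced by an executor run. The KiB/MiB/TiB constants on the class can be used to express the shard size in bytes.
    from data_juicer.core.exporter import Exporter

    exporter = Exporter(export_path='outputs/result.jsonl',
                        export_shard_size=256 * Exporter.MiB,  # split output into ~256 MiB shards
                        num_proc=4)
    exporter.export(processed_dataset)   # also dumps stats when export_stats=True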
export(dataset)[source]
@@ -402,6 +458,17 @@

d a t a _ j u i c e r . c o r e +
+__init__(cfg=None)[source]
+

Initialization method.

+
+
Parameters:
+

cfg – optional config dict.

+
+
+

+
run(load_data_np=None)[source]
@@ -428,6 +495,22 @@

d a t a _ j u i c e r . c o r e +
+__init__(work_dir, show_num=10)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • work_dir – the work directory to store the comparison +results

  • +
  • show_num – the maximum number of samples to show in the +comparison result files.

  • +
+
+
+

+
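A small sketch of the tracer documented above, using trace_batch_mapper (shown below) on two toy HuggingFace datasets; the op name and work dir are hypothetical.
    from datasets import Dataset
    from data_juicer.core.tracer import Tracer

    tracer = Tracer(work_dir='outputs/', show_num=5)
    before = Dataset.from_list([{'text': 'Hello   world'}])
    after = Dataset.from_list([{'text': 'Hello world'}])
    # stores up to show_num differing samples under the work dir for inspection
    tracer.trace_batch_mapper('whitespace_normalization_mapper',
                              previous_ds=before, processed_ds=after,
                              text_key='text')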
trace_batch_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]
diff --git a/data_juicer.format.html b/data_juicer.format.html index 37f558120..32d3cbffb 100644 --- a/data_juicer.format.html +++ b/data_juicer.format.html @@ -137,6 +137,21 @@

d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.csv']

+
+
+__init__(dataset_path, suffixes=None, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • kwargs – extra args

  • +
+
+
+
+

@@ -160,6 +175,27 @@

d a t a _ j u i c e r . f o r m a tBaseFormatter

The class is used to load a dataset from local files or local directory.

+
+
+__init__(dataset_path: str, type: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – path to a dataset file or a dataset +directory

  • +
  • type – a packaged dataset module type (json, csv, etc.)

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • text_keys – key names of field that stores sample +text.

  • +
  • add_suffix – whether to add the file suffix to dataset +meta info

  • +
  • kwargs – extra args

  • +
+
+
+
+
load_dataset(num_proc: int = 1) Dataset[source]
@@ -186,6 +222,22 @@

d a t a _ j u i c e r . f o r m a tBaseFormatter

The class is used to load a dataset from repository of huggingface hub.

+
+
+__init__(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • text_keys – key names of field that stores sample +text.

  • +
  • kwargs – extra args

  • +
+
+
+
+
load_dataset(num_proc: int = 1) Dataset[source]
@@ -277,6 +329,21 @@

d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.json', '.jsonl', '.jsonl.zst']

+
+
+__init__(dataset_path, suffixes=None, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • kwargs – extra args

  • +
+
+
+
+

@@ -314,6 +381,26 @@

d a t a _ j u i c e r . f o r m a t +
+__init__(dataset_path: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys=None, add_suffix=False, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset dir or a list +of them, with optional weights (default 1.0), e.g. <w1> ds.jsonl +<w2> ds_dir <w3> ds_file.json

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • text_keys – key names of field that stores sample text.

  • +
  • add_suffix – whether to add the file suffix to dataset +meta info

  • +
  • kwargs – extra args

  • +
+
+
+

+
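A hedged sketch of the mixture formatter documented above, following the `<weight> <path>` convention described for dataset_path; the file names are hypothetical.
    from data_juicer.format.mixture_formatter import MixtureFormatter

    formatter = MixtureFormatter(
        dataset_path='0.7 corpus_a.jsonl 0.3 corpus_b.jsonl',  # weighted mixture
        text_keys=['text'])
    dataset = formatter.load_dataset(num_proc=4)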
load_dataset(num_proc: int = 1) Dataset[source]
@@ -344,6 +431,21 @@

d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.parquet']

+
+
+__init__(dataset_path, suffixes=None, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • kwargs – extra args

  • +
+
+
+
+

@@ -360,6 +462,23 @@

d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']
+
+
+__init__(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • add_suffix – Whether to add the file suffix to dataset meta +info

  • +
  • kwargs – extra args

  • +
+
+
+
+
load_dataset(num_proc: int = 1) Dataset[source]
@@ -418,6 +537,21 @@

d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.tsv']

+
+
+__init__(dataset_path, suffixes=None, **kwargs)[source]
+

Initialization method.

+
+
Parameters:
+
    +
  • dataset_path – a dataset file or a dataset directory

  • +
  • suffixes – files with specified suffixes to be processed

  • +
  • kwargs – extra args, e.g. delimiter = ‘,’

  • +
+
+
+
+ diff --git a/data_juicer.html b/data_juicer.html index 43f8860d6..9ba6948e0 100644 --- a/data_juicer.html +++ b/data_juicer.html @@ -87,6 +87,7 @@

d a t a _ j u i c e rd a t a _ j u i c e r . a n a l y s i s
  • data_juicer.analysis.column_wise_analysis
    • ColumnWiseAnalysis
        +
      • ColumnWiseAnalysis.__init__()
      • ColumnWiseAnalysis.analyse()
      • ColumnWiseAnalysis.draw_box()
      • ColumnWiseAnalysis.draw_hist()
      • @@ -97,6 +98,7 @@

        d a t a _ j u i c e r
      • data_juicer.analysis.diversity_analysis
        • DiversityAnalysis @@ -108,6 +110,7 @@

          d a t a _ j u i c e r
        • data_juicer.analysis.overall_analysis
          • OverallAnalysis
          • @@ -129,6 +132,7 @@

            d a t a _ j u i c e rd a t a _ j u i c e r . c o r e
            • data_juicer.core.analyser
              • Analyser
              • @@ -136,6 +140,7 @@

                d a t a _ j u i c e r
              • data_juicer.core.data
                • NestedDataset
                    +
                  • NestedDataset.__init__()
                  • NestedDataset.add_column()
                  • NestedDataset.cleanup_cache_files()
                  • NestedDataset.filter()
                  • @@ -147,10 +152,14 @@

                    d a t a _ j u i c e r
                  • NestedDatasetDict
                  • -
                  • NestedQueryDict
                  • +
                  • NestedQueryDict +
                  • nested_obj_factory()
                  • nested_query()
                  • wrap_func_with_nested_access()
                  • @@ -158,6 +167,7 @@

                    d a t a _ j u i c e r
                  • data_juicer.core.executor
                    • Executor
                    • @@ -169,6 +179,7 @@

                      d a t a _ j u i c e rExporter.KiB
                    • Exporter.MiB
                    • Exporter.TiB
                    • +
                    • Exporter.__init__()
                    • Exporter.export()
                    • Exporter.to_jsonl()
                    • Exporter.to_parquet()
                    • @@ -178,6 +189,7 @@

                      d a t a _ j u i c e r
                    • data_juicer.core.ray_executor
                      • RayExecutor
                      • @@ -185,6 +197,7 @@

                        d a t a _ j u i c e r
                      • data_juicer.core.tracer
                        • Tracer
                            +
                          • Tracer.__init__()
                          • Tracer.trace_batch_mapper()
                          • Tracer.trace_deduplicator()
                          • Tracer.trace_filter()
                          • @@ -199,6 +212,7 @@

                            d a t a _ j u i c e rdata_juicer.format.csv_formatter @@ -209,10 +223,12 @@

                            d a t a _ j u i c e r
                          • LocalFormatter
                          • RemoteFormatter
                          • @@ -224,6 +240,7 @@

                            d a t a _ j u i c e rdata_juicer.format.json_formatter @@ -234,6 +251,7 @@

                            d a t a _ j u i c e r
                          • data_juicer.format.mixture_formatter
                            • MixtureFormatter
                            • @@ -242,6 +260,7 @@

                              d a t a _ j u i c e rdata_juicer.format.parquet_formatter @@ -249,6 +268,7 @@

                              d a t a _ j u i c e rdata_juicer.format.text_formatter
                              • TextFormatter
                              • @@ -259,6 +279,7 @@

                                d a t a _ j u i c e rdata_juicer.format.tsv_formatter @@ -462,21 +483,25 @@

                                d a t a _ j u i c e r
                              • data_juicer.ops.base_op
                                • Deduplicator
                                • Filter
                                • Mapper
                                • Selector
                                • @@ -488,6 +513,7 @@

                                  d a t a _ j u i c e r
                                • data_juicer.ops.op_fusion
                                  • FusedFilter @@ -507,6 +533,7 @@

                                    d a t a _ j u i c e rdata_juicer.utils.cache_utils

                                  • data_juicer.utils.ckpt_utils
                                    • CheckpointManager
                                        +
                                      • CheckpointManager.__init__()
                                      • CheckpointManager.check_ckpt()
                                      • CheckpointManager.check_ops_to_skip()
                                      • CheckpointManager.get_left_process_list()
                                      • @@ -523,6 +550,7 @@

                                        d a t a _ j u i c e r
                                      • CacheCompressManager
                                          +
                                        • CacheCompressManager.__init__()
                                        • CacheCompressManager.cleanup_cache_files()
                                        • CacheCompressManager.compress()
                                        • CacheCompressManager.decompress()
                                        • @@ -530,6 +558,7 @@

                                          d a t a _ j u i c e r
                                        • CompressManager @@ -609,6 +638,7 @@

                                          d a t a _ j u i c e r
                                        • data_juicer.utils.fingerprint_utils
                                          • Hasher
                                              +
                                            • Hasher.__init__()
                                            • Hasher.dispatch
                                            • Hasher.hash()
                                            • Hasher.hash_bytes()
                                            • @@ -624,6 +654,7 @@

                                              d a t a _ j u i c e rdata_juicer.utils.logger_utils
                                              • HiddenPrints
                                              • StreamToLoguru @@ -648,6 +679,7 @@

                                                d a t a _ j u i c e r
                                              • data_juicer.utils.registry
                                                • Registry
                                                    +
                                                  • Registry.__init__()
                                                  • Registry.get()
                                                  • Registry.list()
                                                  • Registry.modules
                                                  • diff --git a/data_juicer.ops.common.html b/data_juicer.ops.common.html index 0827112a3..7dbfcbb90 100644 --- a/data_juicer.ops.common.html +++ b/data_juicer.ops.common.html @@ -104,6 +104,12 @@

class data_juicer.ops.common.helper_func.UnionFind[source]

                                                    Bases: object

                                                    +
                                                    +
                                                    +__init__()[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    find(x)[source]
                                                    diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html index 0d2ddda8d..4fe8d8b74 100644 --- a/data_juicer.ops.deduplicator.html +++ b/data_juicer.ops.deduplicator.html @@ -107,6 +107,23 @@

                                                    d a t a _ j u i c e r . o p s . d e d u p l i c a t o r

                                                    Bases: Deduplicator

                                                    Deduplicator to deduplicate samples at document-level using exact matching.

                                                    Using md5 hash to deduplicate samples.

                                                    +
                                                    +
                                                    +__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • lowercase – Whether to convert sample text to lower case

                                                    • +
  • ignore_non_character – Whether to ignore non-alphabet +characters, including whitespace, digits, and punctuation

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args.

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_hash(sample)[source]
                                                    @@ -151,6 +168,40 @@

                                                    d a t a _ j u i c e r . o p s . d e d u p l i c a t o rDeduplicator to deduplicate samples at document-level using MinHashLSH.

                                                    Different from simhash, minhash is stored as bytes, so they won’t be kept in the final dataset.

                                                    +
                                                    +
                                                    +__init__(tokenization: str = 'space', window_size: PositiveInt = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: PositiveInt = 256, jaccard_threshold: ClosedUnitInterval = 0.7, num_bands: PositiveInt | None = None, num_rows_per_band: PositiveInt | None = None, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • tokenization – tokenization method for sample texts. It +should be one of [space, punctuation, character]. For +English-like languages, we recommend using ‘space’, and for +Chinese-like languages, we recommend using ‘character’

                                                    • +
                                                    • window_size – window size of shingling

                                                    • +
                                                    • lowercase – whether to convert text to lower case first

                                                    • +
                                                    • ignore_pattern – whether to ignore sub-strings with +specific pattern when computing minhash

                                                    • +
                                                    • num_permutations – number of permutations in minhash +computing

                                                    • +
                                                    • jaccard_threshold – the min jaccard similarity threshold +in near-duplicate detection. When the jaccard similarity of +two sample texts is >= this threshold, they are regarded as +similar samples and this op will only keep one of them after +deduplication

                                                    • +
  • num_bands – number of bands in LSH. By default it’s None, and +it will be determined by an optimal parameter computation +algorithm that minimizes the weighted sum of the probabilities of +false positives and false negatives

                                                    • +
  • num_rows_per_band – number of rows in each band in LSH. +By default it’s None, and it will be determined by an optimal +parameter computation algorithm

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
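To make the parameters above concrete, a hedged sketch (the module path is assumed to mirror the simhash deduplicator documented below); compute_hash only attaches the MinHash signature, while the actual duplicate removal happens in the op's process step over a whole dataset.
    from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
        DocumentMinhashDeduplicator

    op = DocumentMinhashDeduplicator(tokenization='space',
                                     window_size=5,
                                     jaccard_threshold=0.7)
    sample = {'text': 'the quick brown fox jumps over the lazy dog'}
    sample = op.compute_hash(sample)   # adds the MinHash signature to the sample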
                                                    compute_hash(sample)[source]
                                                    @@ -229,6 +280,33 @@

class data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: PositiveInt = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: PositiveInt = 6, hamming_distance: PositiveInt = 4, *args, **kwargs)[source]

                                                    Bases: Deduplicator

                                                    Deduplicator to deduplicate samples at document-level using SimHash.

                                                    +
                                                    +
                                                    +__init__(tokenization: str = 'space', window_size: PositiveInt = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: PositiveInt = 6, hamming_distance: PositiveInt = 4, *args, **kwargs)[source]
                                                    +

                                                    Initialization method :param tokenization: tokenization method for +sample texts.

                                                    +

It should be one of [space, punctuation, character]. For +English-like languages, we recommend using ‘space’, and for +Chinese-like languages, we recommend using ‘character’

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • window_size – window size of shingling

                                                    • +
                                                    • lowercase – whether to convert text to lower case first

                                                    • +
                                                    • ignore_pattern – whether to ignore sub-strings with +specific pattern when computing simhash

                                                    • +
                                                    • num_blocks – number of blocks in simhash computing

                                                    • +
  • hamming_distance – the max hamming distance threshold in +near-duplicate detection. When the hamming distance of two +sample texts is <= this threshold, they are regarded as +similar samples and this op will only keep one of them after +deduplication. This threshold should always be less than +num_blocks

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_hash(sample)[source]
                                                    diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html index 38e4082c5..dec516ab6 100644 --- a/data_juicer.ops.filter.html +++ b/data_juicer.ops.filter.html @@ -120,6 +120,30 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with alphabet/numeric ratio within a specific range.

                                                    +
                                                    +
                                                    +__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: PositiveFloat = 9223372036854775807, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • tokenization – Whether to count the ratio of alphanumeric +to the total number of tokens. If tokenization=False, it +will count the ratio of alphanumeric to the total number of +characters.

                                                    • +
                                                    • min_ratio – The min filter ratio in alphanumeric op, +samples will be filtered if their alphabet/numeric ratio is +below this parameter.

                                                    • +
                                                    • max_ratio – The max filter ratio in alphanumeric op, +samples will be filtered if their alphabet/numeric ratio +exceeds this parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
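A hedged sketch of the filter interface described above; the module path and the Fields.stats constant are assumptions based on the package layout. Every Filter first computes stats for a sample, then decides whether to keep it.
    from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter
    from data_juicer.utils.constant import Fields   # assumed home of the stats field name

    op = AlphanumericFilter(min_ratio=0.25)
    sample = {'text': 'Hello, world! 123', Fields.stats: {}}
    sample = op.compute_stats(sample)   # writes the alpha/numeric ratio into the stats field
    keep = op.process(sample)           # True if the ratio lies within [min_ratio, max_ratio]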
                                                    compute_stats(sample)[source]
                                                    @@ -164,6 +188,26 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with average line length within a specific range.

                                                    +
                                                    +
                                                    +__init__(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • min_len – The min filter length in this op, samples will +be filtered if their average line length is below this +parameter.

                                                    • +
                                                    • max_len – The max filter length in this op, samples will +be filtered if their average line length exceeds this +parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample, context=False)[source]
                                                    @@ -208,6 +252,27 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with char-level n-gram repetition ratio within a specific range.

                                                    +
                                                    +
                                                    +__init__(rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • rep_len – Repetition length for char-level n-gram.

                                                    • +
                                                    • min_ratio – The min filter ratio in this op, samples will +be filtered if their char-level n-gram repetition ratio is +below this parameter.

                                                    • +
                                                    • max_ratio – The max filter ratio in this op, samples will +be filtered if their char-level n-gram repetition ratio +exceeds this parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -252,6 +317,33 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with flagged-word ratio less than a specific max value.

                                                    +
                                                    +
                                                    +__init__(lang: str = 'en', tokenization: bool = False, max_ratio: ClosedUnitInterval = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • lang – The language whose flagged words are considered. If lang == +“all”, we will adopt the list merged from all the available +languages

                                                    • +
                                                    • tokenization – Whether to use model to tokenize documents

                                                    • +
                                                    • max_ratio – The max filter ratio in this op.

                                                    • +
                                                    • flagged_words_dir – The directory storing the +flagged_words file(s) whose name includes “flagged_words” +and in json format

                                                    • +
                                                    • use_words_aug – Whether to augment words, especially for +Chinese and Vietnamese

                                                    • +
                                                    • words_aug_group_sizes – The group size of words to augment

                                                    • +
                                                    • words_aug_join_char – The join char between words to +augment

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample, context=False)[source]
                                                    @@ -296,6 +388,23 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples in a specific language with confidence score larger than a specific min value.

                                                    +
                                                    +
                                                    +__init__(lang: str = '', min_score: ClosedUnitInterval = 0.8, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • lang – The language of the samples to keep.

                                                    • +
                                                    • min_score – The min language identification confidence +scores of samples to keep.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -340,6 +449,26 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with maximum line length within a specific range.

                                                    +
                                                    +
                                                    +__init__(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • min_len – The min filter length in this op, samples will +be filtered if their maximum line length is below this +parameter.

                                                    • +
                                                    • max_len – The max filter length in this op, samples will +be filtered if their maximum line length exceeds this +parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample, context=False)[source]
                                                    @@ -384,6 +513,23 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with perplexity score less than a specific max value.

                                                    +
                                                    +
                                                    +__init__(lang: str = 'en', max_ppl: PositiveFloat = 1500, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • lang – The language of the perplexity model used to score the samples.

                                                    • +
                                                    • max_ppl – The max filter perplexity in this op, samples +will be filtered if their perplexity exceeds this parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample, context=False)[source]
                                                    @@ -428,6 +574,26 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with special-char ratio within a specific range.

                                                    +
                                                    +
                                                    +__init__(min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.25, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
                                                    • min_ratio – The min filter ratio in this op, samples will +be filtered if their special-char ratio is below this +parameter.

                                                    • +
                                                    • max_ratio – The max filter ratio in this op, samples will +be filtered if their special-char ratio exceeds this +parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -473,6 +639,26 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e r +
                                                    +__init__(field_key: str = '', target_value: List | Tuple = [], *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • field_key – Filter based on the specified value +corresponding to the target key. The target key +corresponding to multi-level field information needs to be +separated by ‘.’.

                                                    • +
                                                    • target_value – The range of specified field information +corresponding to the samples that need to be retained.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +

                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -518,6 +704,30 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e r +
                                                    +__init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • field_key – Filter based on the specified numeric value +corresponding to the target key. The target key +corresponding to multi-level field information needs to be +separated by ‘.’.

                                                    • +
                                                    • min_value – The min filter value in SpecifiedNumericField +op, samples will be filtered if their specified numeric +field value is below this parameter.

                                                    • +
                                                    • max_value – The max filter value in SpecifiedNumericField +op, samples will be filtered if their specified numeric +field value exceeds this parameter.

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +

                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -567,6 +777,32 @@

                                                    d a t a _ j u i c e r . o p s . f i l t e rFilter

                                                    Filter to keep samples with stopword ratio larger than a specific min value.

                                                    +
                                                    +
                                                    +__init__(lang: str = 'en', tokenization: bool = False, min_ratio: ClosedUnitInterval = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • lang – The language whose stopwords are considered. If lang == +“all”, we will adopt the list merged from all the available +languages

                                                    • +
                                                    • tokenization – whether to use model to tokenize documents

                                                    • +
                                                    • min_ratio – The min filter ratio in this op.

                                                    • +
                                                    • stopwords_dir – The directory storing the stopwords +file(s) whose name includes “stopwords” and in json format

                                                    • +
                                                    • use_words_aug – Whether to augment words, especially for +Chinese and Vietnamese

                                                    • +
                                                    • words_aug_group_sizes – The group size of words to augment

                                                    • +
                                                    • words_aug_join_char – The join char between words to +augment

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample, context=False)[source]
                                                    @@ -610,6 +846,22 @@

class data_juicer.ops.filter.suffix_filter.SuffixFilter(suffixes: str | List[str] | Tuple[str] = [], *args, **kwargs)[source]

                                                    Bases: Filter

                                                    Filter to keep samples with specified suffix.

                                                    +
                                                    +
                                                    +__init__(suffixes: str | List[str] | Tuple[str] = [], *args, **kwargs)[source]
                                                    +

                                                    Initialization method.

                                                    +
                                                    +
                                                    Parameters:
                                                    +
                                                      +
  • suffixes – the suffixes of the samples that will be kept. +For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]

                                                    • +
                                                    • args – extra args

                                                    • +
                                                    • kwargs – extra args

                                                    • +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    compute_stats(sample)[source]
                                                    @@ -654,6 +906,26 @@

class data_juicer.ops.filter.text_length_filter.TextLengthFilter(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with total text length within a specific range.

__init__(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• min_len – The min text length in the filtering. Samples will be filtered if their text length is below this parameter.
• max_len – The max text length in the filtering. Samples will be filtered if their text length exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats(sample)[source]
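
A minimal end-to-end sketch of the filter contract with hypothetical thresholds. It assumes samples are dicts with a 'text' field, that per-sample statistics live under a 'stats' key (the exact key name used by data_juicer may differ), that compute_stats() returns the updated sample, and that process() returns a boolean keep decision:

    from data_juicer.ops.filter.text_length_filter import TextLengthFilter

    op = TextLengthFilter(min_len=50, max_len=20000)   # hypothetical thresholds

    samples = [{'text': 'too short', 'stats': {}},
               {'text': 'x' * 500, 'stats': {}}]

    # compute_stats() records the text length of each sample; process() then
    # decides, based on min_len/max_len, whether the sample is kept.
    samples = [op.compute_stats(s) for s in samples]
    kept = [s for s in samples if op.process(s)]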

class data_juicer.ops.filter.token_num_filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: PositiveInt = 10, max_num: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with total token number within a specific range.

__init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: PositiveInt = 10, max_num: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_tokenizer – The tokenizer name of Hugging Face tokenizers.
• min_num – The min filter token number in this op. Samples will be filtered if their token number is below this parameter.
• max_num – The max filter token number in this op. Samples will be filtered if their token number exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats(sample)[source]
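
A construction sketch; the tokenizer name is the documented default, the thresholds are hypothetical, and the tokenizer is assumed to be available locally or downloadable from the Hugging Face hub:

    from data_juicer.ops.filter.token_num_filter import TokenNumFilter

    op = TokenNumFilter(hf_tokenizer='EleutherAI/pythia-6.9b-deduped',
                        min_num=16,
                        max_num=4096)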

class data_juicer.ops.filter.word_num_filter.WordNumFilter(lang: str = 'en', tokenization: bool = False, min_num: PositiveInt = 10, max_num: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with total word number within a specific range.

__init__(lang: str = 'en', tokenization: bool = False, min_num: PositiveInt = 10, max_num: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – The language of the samples.
• tokenization – Whether to use a model to tokenize documents.
• min_num – The min filter word number in this op. Samples will be filtered if their word number is below this parameter.
• max_num – The max filter word number in this op. Samples will be filtered if their word number exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats(sample, context=False)[source]
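
A construction sketch for a language without whitespace-delimited words, where model-based tokenization is enabled; the thresholds are hypothetical:

    from data_juicer.ops.filter.word_num_filter import WordNumFilter

    # For Chinese text, splitting on whitespace is not meaningful, so a
    # tokenization model is used to count words instead.
    op = WordNumFilter(lang='zh', tokenization=True, min_num=10, max_num=10000)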

class data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter(lang: str = 'en', tokenization: bool = False, rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with word-level n-gram repetition ratio within a specific range.

__init__(lang: str = 'en', tokenization: bool = False, rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – The language of the samples.
• tokenization – Whether to use a model to tokenize documents.
• rep_len – Repetition length for word-level n-grams.
• min_ratio – The min filter ratio in this op. Samples will be filtered if their word-level n-gram repetition ratio is below this parameter.
• max_ratio – The max filter ratio in this op. Samples will be filtered if their word-level n-gram repetition ratio exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats(sample, context=False)[source]
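
A construction sketch using the documented defaults; it is meant to drop samples in which repeated 10-gram word sequences account for more than half of the text:

    from data_juicer.ops.filter.word_repetition_filter import WordRepetitionFilter

    op = WordRepetitionFilter(lang='en', rep_len=10, min_ratio=0.0, max_ratio=0.5)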


class data_juicer.ops.base_op.Deduplicator(text_key: str | None = None)[source]

Bases: object

__init__(text_key: str | None = None)[source]

Base class that conducts deduplication.

Parameters:
text_key – the key name of field that stores sample texts to be processed

compute_hash(sample)[source]

class data_juicer.ops.base_op.Filter(text_key: str | None = None)[source]

Bases: object

__init__(text_key: str | None = None)[source]

Base class that removes specific info.

Parameters:
text_key – the key name of field that stores sample texts to be processed

compute_stats(sample, context=False)[source]
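
The Filter contract pairs compute_stats(), which records per-sample statistics, with process(), which returns a keep decision. A minimal subclass sketch follows; it assumes the base class exposes the configured key as self.text_key (defaulting to a 'text' field) and that statistics live under a 'stats' key, whose exact name in data_juicer may differ:

    from data_juicer.ops.base_op import Filter

    class DigitCountFilter(Filter):
        """Hypothetical filter: keep samples containing at least `min_digits` digits."""

        def __init__(self, min_digits: int = 1, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.min_digits = min_digits

        def compute_stats(self, sample, context=False):
            stats = sample.setdefault('stats', {})   # assumed stats field name
            stats['num_digits'] = sum(ch.isdigit() for ch in sample[self.text_key])
            return sample

        def process(self, sample):
            # True keeps the sample, False filters it out.
            return sample['stats']['num_digits'] >= self.min_digits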

class data_juicer.ops.base_op.Mapper(text_key: str | None = None)[source]

Bases: object

__init__(text_key: str | None = None)[source]

Base class that conducts text editing.

Parameters:
text_key – the key name of field that stores sample texts to be processed.

is_batched_op()[source]
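
A minimal Mapper subclass sketch: process() edits the stored text and returns the sample. It assumes the base class exposes the configured key as self.text_key, defaulting to a 'text' field:

    from data_juicer.ops.base_op import Mapper

    class LowercaseMapper(Mapper):
        """Hypothetical mapper: lowercase the sample text."""

        def process(self, sample):
            sample[self.text_key] = sample[self.text_key].lower()
            return sample

    op = LowercaseMapper()
    print(op.process({'text': 'Hello WORLD'}))   # {'text': 'hello world'}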

class data_juicer.ops.base_op.Selector(text_key: str | None = None)[source]

Bases: object

__init__(text_key: str | None = None)[source]

Base class that conducts selection at the dataset level.

Parameters:
text_key – the key name of field that stores sample texts to be processed

process(dataset)[source]

class data_juicer.ops.op_fusion.FusedFilter(fused_filters: List)[source]

Bases: Filter

A fused operator for filters.

__init__(fused_filters: List)[source]

Initialization method.

Parameters:
fused_filters – a list of filters to be fused

compute_stats(sample)[source]
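
A construction sketch that fuses two of the filters documented above into a single operator; the thresholds are hypothetical:

    from data_juicer.ops.filter.text_length_filter import TextLengthFilter
    from data_juicer.ops.filter.word_num_filter import WordNumFilter
    from data_juicer.ops.op_fusion import FusedFilter

    fused = FusedFilter(fused_filters=[TextLengthFilter(min_len=50),
                                       WordNumFilter(min_num=10)])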

class data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper(*args, **kwargs)[source]

Bases: Mapper

Mapper to clean copyright comments at the beginning of the text samples.

__init__(*args, **kwargs)[source]

Initialization method.

Parameters:
• args – extra args
• kwargs – extra args

class data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper(*args, **kwargs)[source]

Bases: Mapper

Mapper to clean email in text samples.

__init__(*args, **kwargs)[source]

Initialization method.

Parameters:
• args – extra args
• kwargs – extra args

process(sample)[source]

class data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper(*args, **kwargs)[source]

Bases: Mapper

Mapper to clean html code in text samples.

__init__(*args, **kwargs)[source]

Initialization method.

Parameters:
• args – extra args
• kwargs – extra args

process(sample)[source]

class data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper(*args, **kwargs)[source]

Bases: Mapper

Mapper to clean ipv4 and ipv6 address in text samples.

__init__(*args, **kwargs)[source]

Initialization method.

Parameters:
• args – extra args
• kwargs – extra args

process(sample)[source]

class data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean links like http/https/ftp in text samples.

    __init__(*args, **kwargs)[source]

        Initialization method.

        Parameters:
            • args – extra args
            • kwargs – extra args
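A minimal usage sketch (not part of the generated reference): it assumes the sample dict stores its content under Data-Juicer's default 'text' key and that process returns the modified sample.

    from data_juicer.ops.mapper.clean_links_mapper import CleanLinksMapper

    op = CleanLinksMapper()
    # Assumed sample layout: the text lives under the default 'text' key.
    sample = {'text': 'See https://example.com/page or ftp://host/file for details.'}
    result = op.process(sample)
    print(result['text'])  # the http/https/ftp links should be stripped out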
class data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to expand macro definitions in the document body of Latex samples.

    __init__(*args, **kwargs)[source]

        Initialization method.

        Parameters:
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
class data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to fix unicode errors in text samples.

    __init__(*args, **kwargs)[source]

        Initialization method.

        Parameters:
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
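Another illustrative sketch under the same assumptions ('text' key, process returning the sample). The exact normalization applied to broken text depends on the underlying unicode-fixing library, so the output noted in the comment is only indicative.

    from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper

    op = FixUnicodeMapper()
    # Mojibake-style input: 'café' mis-decoded as 'cafÃ©'.
    sample = {'text': 'The cafÃ© is open.'}
    result = op.process(sample)
    print(result['text'])  # expected to read roughly 'The café is open.'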
class data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int = 1, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to simply augment samples in English based on the nlpaug library.

    __init__(sequential: bool = False, aug_num: int = 1, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]

        Initialization method. All augmentation methods use default parameters by default. We recommend enabling only 1-3 augmentation methods at a time; otherwise, the semantics of the samples might change significantly.

        Parameters:
            • sequential – whether to combine all augmentation methods into a sequence. If True, a sample will be augmented by all enabled augmentation methods sequentially. If False, each enabled augmentation method generates its augmented samples independently.
            • aug_num – number of augmented samples to be generated. If sequential is True, there will be aug_num augmented samples in total. If False, there will be (aug_num * #enabled_aug_methods) augmented samples.
            • delete_random_word – whether to enable the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
            • swap_random_word – whether to enable the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
            • spelling_error_word – whether to enable the augmentation method of simulating spelling errors for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
            • split_random_word – whether to enable the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
            • keyboard_error_char – whether to enable the augmentation method of simulating keyboard errors for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
            • ocr_error_char – whether to enable the augmentation method of simulating OCR errors for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
            • delete_random_char – whether to enable the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
            • swap_random_char – whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
            • insert_random_char – whether to enable the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
            • args – extra args
            • kwargs – extra args

    process(samples)[source]
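A configuration sketch illustrating the aug_num arithmetic described above; it only uses the constructor parameters listed in the signature and makes no claims about the batched sample format consumed by process.

    from data_juicer.ops.mapper.nlpaug_en_mapper import NlpaugEnMapper

    # Two augmentation methods are enabled: word deletion and character swap.
    # With sequential=False and aug_num=2, each enabled method produces its own
    # augmented texts, i.e. about 2 * 2 = 4 augmented samples per input.
    # With sequential=True, both methods are chained into one pipeline and only
    # aug_num = 2 augmented samples per input are produced.
    op = NlpaugEnMapper(
        sequential=False,
        aug_num=2,
        delete_random_word=True,
        swap_random_char=True,
    )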
class data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper(sequential: bool = False, aug_num: int = 1, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to simply augment samples in Chinese based on the nlpcda library.

    __init__(sequential: bool = False, aug_num: int = 1, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]

        Initialization method. All augmentation methods use default parameters by default. We recommend enabling only 1-3 augmentation methods at a time; otherwise, the semantics of the samples might change significantly. Notice: some augmentation methods might not work for certain special texts, so no augmented texts may be generated for them. (The Chinese example sentence used below means "There are 5 different data augmentation methods here in total.")

        Parameters:
            • sequential – whether to combine all augmentation methods into a sequence. If True, a sample will be augmented by all enabled augmentation methods sequentially. If False, each enabled augmentation method generates its augmented samples independently.
            • aug_num – number of augmented samples to be generated. If sequential is True, there will be aug_num augmented samples in total. If False, there will be (aug_num * #enabled_aug_methods) augmented samples.
            • replace_similar_word – whether to enable the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
            • replace_homophone_char – whether to enable the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
            • delete_random_char – whether to enable the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
            • swap_random_char – whether to enable the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
            • replace_equivalent_num – whether to enable the augmentation method of replacing random numbers with their equivalent representations in the original texts. Notice: only applies to numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
            • args – extra args
            • kwargs – extra args

    process(samples)[source]
class data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to normalize unicode punctuation to English punctuation in text samples.

    __init__(*args, **kwargs)[source]

        Initialization method.

        Parameters:
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
class data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove the bibliography at the end of documents in Latex samples.

    __init__(*args, **kwargs)[source]

        Initialization method.

        Parameters:
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
class data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove comments in different kinds of documents.

    Only 'tex' is supported for now.

    __init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

        Initialization method.

        Parameters:
            • doc_type – type of document whose comments should be removed
            • inline – whether to remove inline comments
            • multiline – whether to remove multiline comments
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
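An illustrative sketch under the same 'text'-key assumption; it only exercises the parameters documented above.

    from data_juicer.ops.mapper.remove_comments_mapper import RemoveCommentsMapper

    op = RemoveCommentsMapper(doc_type='tex', inline=True, multiline=True)
    sample = {'text': '% full-line comment\n\\section{Intro} % trailing comment\nBody text.\n'}
    result = op.process(sample)
    # TeX comments should be removed; '\section{Intro}' and 'Body text.' remain.
    print(result['text'])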
class data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove headers at the beginning of documents in Latex samples.

    __init__(drop_no_head: bool = True, *args, **kwargs)[source]

        Initialization method.

        Parameters:
            • drop_no_head – whether to drop sample texts without headers
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
class data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper(min_len: PositiveInt = 1, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove long words within a specific range.

    __init__(min_len: PositiveInt = 1, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]

        Initialization method.

        Parameters:
            • min_len – the minimum word length in this op; words whose length is below this value will be filtered out
            • max_len – the maximum word length in this op; words whose length exceeds this value will be filtered out
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
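A sketch of the length-based filtering, again assuming the default 'text' key; per the parameter descriptions above, words are dropped only when they fall below min_len or exceed max_len.

    from data_juicer.ops.mapper.remove_long_words_mapper import RemoveLongWordsMapper

    # Keep words whose length lies in [2, 10]; drop the rest.
    op = RemoveLongWordsMapper(min_len=2, max_len=10)
    sample = {'text': 'a reasonably pneumonoultramicroscopic sentence'}
    result = op.process(sample)
    print(result['text'])  # 'a' (too short) and the 24-letter word (too long) should be gone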
class data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean specific chars in text samples.

    __init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

        Initialization method.

        Parameters:
            • chars_to_remove – a list or a string including all characters that need to be removed from the text
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
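A short sketch showing a custom character set; same 'text'-key assumption as above.

    from data_juicer.ops.mapper.remove_specific_chars_mapper import RemoveSpecificCharsMapper

    # Override the default symbol set with three bullet-like characters.
    op = RemoveSpecificCharsMapper(chars_to_remove='◆●■')
    sample = {'text': '◆ item one ● item two ■ item three'}
    result = op.process(sample)
    print(result['text'])  # the ◆ ● ■ symbols should be stripped from the text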
class data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper(min_col: from_2_to_20 = 2, max_col: from_2_to_20 = 20, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove table texts from text samples.

    __init__(min_col: from_2_to_20 = 2, max_col: from_2_to_20 = 20, *args, **kwargs)[source]

        Initialization method.

        Parameters:
            • min_col – the minimum number of columns of tables to remove
            • max_col – the maximum number of columns of tables to remove
            • args – extra args
            • kwargs – extra args

    process(sample)[source]
class data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List | None = None, *args, **kwargs)[source]

Bases: Mapper

Mapper to remove words containing incorrect substrings.

__init__(lang: str = 'en', tokenization: bool = False, substrings: List | None = None, *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – language of the samples.
• tokenization – whether to use a model to tokenize the documents.
• substrings – the incorrect substrings to look for in words.
• args – extra args
• kwargs – extra args
                                                                                                                                                process(sample)[source]
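A minimal usage sketch of this mapper. The import path and constructor arguments follow the signature above; the sample layout (a plain dict with a 'text' key) and the concrete substrings are illustrative assumptions:

    from data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper import \
        RemoveWordsWithIncorrectSubstringsMapper

    # remove words that contain typical crawling artifacts (illustrative substrings)
    op = RemoveWordsWithIncorrectSubstringsMapper(lang='en',
                                                  tokenization=False,
                                                  substrings=['http', '.com'])
    sample = {'text': 'visit https://example.com for more details'}
    sample = op.process(sample)   # words containing any of the substrings are dropped
    print(sample['text'])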
class data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]

Bases: Mapper

Mapper to split text samples into sentences.

__init__(lang: str = 'en', *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – language in which the text is split into sentences.
• args – extra args
• kwargs – extra args
                                                                                                                                                process(sample)[source]
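A minimal sketch, again assuming dict samples with a 'text' field; how the split sentences are re-joined in the output is an assumption noted in the comment:

    from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper

    op = SentenceSplitMapper(lang='en')          # language of the sentence splitter
    sample = {'text': 'First sentence. Second sentence.'}
    sample = op.process(sample)                  # sentences presumably end up one per line
    print(sample['text'])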
class data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]

Bases: Mapper

Mapper to normalize different kinds of whitespace characters to the plain space ' ' (0x20). The different kinds of whitespace characters are listed at https://en.wikipedia.org/wiki/Whitespace_character.

__init__(*args, **kwargs)[source]

Initialization method.

Parameters:
• args – extra args
• kwargs – extra args
                                                                                                                                                process(sample)[source]
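A minimal sketch for this mapper; the module path is inferred from the mapper list in this package, and the dict sample layout with a 'text' key is an assumption:

    from data_juicer.ops.mapper.whitespace_normalization_mapper import \
        WhitespaceNormalizationMapper

    op = WhitespaceNormalizationMapper()
    sample = {'text': 'tab\there\u00a0and a no-break space'}
    sample = op.process(sample)        # exotic whitespace becomes plain ' ' (0x20)
    print(sample['text'])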
diff --git a/data_juicer.ops.selector.html b/data_juicer.ops.selector.html index 83ceeface..5b36f1683 100644

class data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]

Bases: Selector

Selector to select samples based on the sorted frequency of a specified field.

__init__(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]

Initialization method.

Parameters:
• field_key – the target key on which the selection is based. Multi-level field information in the target key is separated by '.'.
• top_ratio – ratio of top specified-field values to select; samples whose specified field values fall within this ratio are selected. When both topk and top_ratio are set, the one yielding the smaller number of samples is applied.
• topk – number of top specified-field values to select; samples whose specified field values fall within this number are selected. When both topk and top_ratio are set, the one yielding the smaller number of samples is applied.
• reverse – determines the sorting order; if reverse=True, sort in descending order.
• args – extra args
• kwargs – extra args
                                                                                                                                                process(dataset)[source]
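A hedged sketch of driving this selector directly. The constructor arguments follow the signature above; the module path, the nested 'meta.language' field, and the assumption that process(dataset) returns the selected subset are illustrative:

    from data_juicer.ops.selector.frequency_specified_field_selector import \
        FrequencySpecifiedFieldSelector

    # keep samples whose 'meta.language' values are among the top 20% most frequent
    op = FrequencySpecifiedFieldSelector(field_key='meta.language',
                                         top_ratio=0.2,
                                         reverse=True)
    selected = op.process(dataset)     # `dataset` stands for an already loaded dataset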
class data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]

Bases: Selector

Selector to select the top samples based on the sorted value of a specified field.

__init__(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]

Initialization method.

Parameters:
• field_key – the target key on which the selection is based. Multi-level field information in the target key is separated by '.'.
• top_ratio – ratio of top samples to select; samples whose specified field values fall within this ratio are selected. When both topk and top_ratio are set, the one yielding the smaller number of samples is applied.
• topk – number of top samples to select; samples whose specified field values fall within this number are selected. When both topk and top_ratio are set, the one yielding the smaller number of samples is applied.
• reverse – determines the sorting order; if reverse=True, sort in descending order.
• args – extra args
• kwargs – extra args
                                                                                                                                                process(dataset)[source]
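A companion sketch for the top-k variant, under the same assumptions as above. When both topk and top_ratio are given the smaller resulting sample count wins, so only topk is set here; the field name is illustrative:

    from data_juicer.ops.selector.topk_specified_field_selector import \
        TopkSpecifiedFieldSelector

    # keep the 1000 samples with the highest values of an illustrative stats field
    op = TopkSpecifiedFieldSelector(field_key='stats.perplexity',
                                    topk=1000,
                                    reverse=True)
    selected = op.process(dataset)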
diff --git a/data_juicer.utils.html b/data_juicer.utils.html index e565cb315..312e588d7 100644

__init__(ckpt_dir, original_process_list, num_proc=1)[source]

Initialization method.

Parameters:
• ckpt_dir – path to save and load the checkpoint.
• original_process_list – the process list in the config.
• num_proc – number of process workers used when saving the dataset.
                                                                                                                                                check_ckpt()[source]
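This entry appears to belong to the checkpoint manager in the ckpt_utils module listed for this package; a hedged construction sketch in which the import path, the checkpoint semantics, and the example process list are assumptions:

    from data_juicer.utils.ckpt_utils import CheckpointManager   # assumed import path

    # illustrative process list in the config's op-list format (a list of {op_name: args} dicts)
    process_list = [{'language_id_score_filter': {'min_score': 0.8}}]
    ckpt_manager = CheckpointManager(ckpt_dir='./outputs/ckpt',
                                     original_process_list=process_list,
                                     num_proc=4)
    if ckpt_manager.check_ckpt():   # presumably True when a usable checkpoint exists
        pass                        # resume from the recorded checkpoint instead of starting over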
Bases: object

This class is used to compress or decompress Hugging Face cache files using compression format algorithms.

__init__(compressor_format: str = 'zstd')[source]

Initialization method.

Parameters:
compressor_format – the compression format algorithm, 'zstd' by default.
                                                                                                                                                cleanup_cache_files(ds)[source]
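This entry appears to be the cache-compression manager (a CacheCompressManager is listed under data_juicer.utils); a hedged sketch using only the constructor and the cleanup_cache_files method shown here, with the import path, dataset source, and cleanup semantics as assumptions:

    from datasets import load_dataset
    from data_juicer.utils.compress import CacheCompressManager   # assumed import path

    ds = load_dataset('json', data_files='demo.jsonl', split='train')   # placeholder dataset
    manager = CacheCompressManager(compressor_format='zstd')   # gzip / lz4 / zstd compressors are listed in this module
    manager.cleanup_cache_files(ds)   # presumably removes this dataset's compressed cache files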
Bases: object

This class is used to compress or decompress an input file using compression format algorithms.

__init__(compressor_format: str = 'zstd')[source]

Initialization method.

Parameters:
compressor_format – the compression format algorithm, 'zstd' by default.
                                                                                                                                                compress(input_path: Path | str, output_path: Path | str)[source]
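A similar sketch for the file-level variant. The class name is not visible in this hunk (a CompressManager is listed in the same module), so it is an assumption, while compress(input_path, output_path) follows the documented signature:

    from data_juicer.utils.compress import CompressManager   # assumed class name and path

    cm = CompressManager(compressor_format='zstd')
    cm.compress('samples.jsonl', 'samples.jsonl.zst')   # compress the input file to the target path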
class data_juicer.utils.fingerprint_utils.Hasher[source]

Bases: object

Hasher that accepts Python objects as inputs.

__init__()[source]
                                                                                                                                                dispatch: Dict = {}
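A minimal sketch of the fingerprint hasher, assuming the update/hexdigest/hash interface suggested by the members listed for this class:

    from data_juicer.utils.fingerprint_utils import Hasher

    h = Hasher()
    h.update({'text': 'hello', 'meta': {'lang': 'en'}})   # feed any picklable Python object
    print(h.hexdigest())                                  # hex fingerprint of everything fed so far

    print(Hasher.hash(['a', 'b', 3]))                     # one-shot hashing (assumed classmethod)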
class data_juicer.utils.logger_utils.StreamToLoguru(level='INFO', caller_names=('datasets', 'logging'))[source]

Bases: object

Stream object that redirects writes to a logger instance.

__init__(level='INFO', caller_names=('datasets', 'logging'))[source]

Initialization method.

Parameters:
• level – log level string of loguru. Default value: "INFO".
• caller_names – caller names of the redirected modules. Default value: ('datasets', 'logging').
                                                                                                                                                flush()[source]
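A hedged sketch of redirecting standard output through loguru. write() and flush() are documented above; that caller_names restricts redirection to writes coming from the listed modules is an assumption based on the parameter description:

    import sys
    from data_juicer.utils.logger_utils import StreamToLoguru

    # route writes originating from the 'datasets' and 'logging' modules into loguru at INFO level
    sys.stdout = StreamToLoguru(level='INFO', caller_names=('datasets', 'logging'))
    ...
    sys.stdout = sys.__stdout__    # restore the original stream when done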
Bases: object

This class is used to register modules to a registry under a repo name.

__init__(name: str)[source]

Initialization method.

Parameters:
name – a registry repo name
                                                                                                                                                get(module_key)[source]
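A hedged sketch of the registry, assuming it lives in data_juicer.utils.registry and that register_module works as a class decorator (the package's own example references a TextFormatter2 registered as 'text_formatter2'):

    from data_juicer.utils.registry import Registry   # assumed import path

    FORMATTERS = Registry('formatter')                 # a registry repo named 'formatter'

    @FORMATTERS.register_module('text_formatter2')
    class TextFormatter2:
        pass

    cls = FORMATTERS.get('text_formatter2')            # -> TextFormatter2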
diff --git a/genindex.html b/genindex.html index 9046c3037..03db3bd90 100644

Index

A new "_" group appears before A–Z in the index navigation.

diff --git a/objects.inv b/objects.inv index a77bf52fb..f9adf6cef 100644
Binary files a/objects.inv and b/objects.inv differ

diff --git a/searchindex.js b/searchindex.js index 912e30c48..f1a093fad 100644
Regenerated Sphinx search index (the single-line Search.setIndex(...) payload covering all data_juicer modules).
"data_juicer.core": [[3, 0, 0, "-", "analyser"], [3, 0, 0, "-", "data"], [3, 0, 0, "-", "executor"], [3, 0, 0, "-", "exporter"], [3, 0, 0, "-", "ray_executor"], [3, 0, 0, "-", "tracer"]], "data_juicer.core.analyser": [[3, 1, 1, "", "Analyser"]], "data_juicer.core.analyser.Analyser": [[3, 2, 1, "", "run"]], "data_juicer.core.data": [[3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "NestedDatasetDict"], [3, 1, 1, "", "NestedQueryDict"], [3, 3, 1, "", "nested_obj_factory"], [3, 3, 1, "", "nested_query"], [3, 3, 1, "", "wrap_func_with_nested_access"]], "data_juicer.core.data.NestedDataset": [[3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.data.NestedDatasetDict": [[3, 2, 1, "", "map"]], "data_juicer.core.executor": [[3, 1, 1, "", "Executor"]], "data_juicer.core.executor.Executor": [[3, 2, 1, "", "run"]], "data_juicer.core.exporter": [[3, 1, 1, "", "Exporter"]], "data_juicer.core.exporter.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "export"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.ray_executor": [[3, 1, 1, "", "RayExecutor"]], "data_juicer.core.ray_executor.RayExecutor": [[3, 2, 1, "", "run"]], "data_juicer.core.tracer": [[3, 1, 1, "", "Tracer"]], "data_juicer.core.tracer.Tracer": [[3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 0, 0, "-", "csv_formatter"], [4, 0, 0, "-", "formatter"], [4, 0, 0, "-", "json_formatter"], [4, 0, 0, "-", "load"], [4, 0, 0, "-", "mixture_formatter"], [4, 0, 0, "-", "parquet_formatter"], [4, 0, 0, "-", "text_formatter"], [4, 0, 0, "-", "tsv_formatter"]], "data_juicer.format.csv_formatter": [[4, 1, 1, "", "CsvFormatter"]], "data_juicer.format.csv_formatter.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"]], "data_juicer.format.formatter": [[4, 1, 1, "", "BaseFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 3, 1, "", "add_suffixes"], [4, 3, 1, "", "load_formatter"], [4, 3, 1, "", "unify_format"]], "data_juicer.format.formatter.BaseFormatter": [[4, 2, 1, "", "load_dataset"]], "data_juicer.format.formatter.LocalFormatter": [[4, 2, 1, "", "load_dataset"]], "data_juicer.format.formatter.RemoteFormatter": [[4, 2, 1, "", "load_dataset"]], "data_juicer.format.json_formatter": [[4, 1, 1, "", "JsonFormatter"]], "data_juicer.format.json_formatter.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"]], "data_juicer.format.load": [[4, 3, 1, "", "load_formatter"]], "data_juicer.format.mixture_formatter": [[4, 1, 1, "", "MixtureFormatter"]], "data_juicer.format.mixture_formatter.MixtureFormatter": [[4, 2, 1, "", "load_dataset"]], "data_juicer.format.parquet_formatter": [[4, 1, 1, "", "ParquetFormatter"]], "data_juicer.format.parquet_formatter.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"]], "data_juicer.format.text_formatter": [[4, 1, 1, "", "TextFormatter"], [4, 3, 1, "", "extract_txt_from_docx"], [4, 3, 1, "", "extract_txt_from_pdf"]], "data_juicer.format.text_formatter.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.tsv_formatter": [[4, 1, 1, "", "TsvFormatter"]], "data_juicer.format.tsv_formatter.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"]], "data_juicer.ops": [[5, 0, 0, "-", "base_op"], [5, 0, 0, "-", 
"load"], [5, 0, 0, "-", "op_fusion"]], "data_juicer.ops.base_op": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"]], "data_juicer.ops.base_op.Deduplicator": [[5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Filter": [[5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Mapper": [[5, 2, 1, "", "is_batched_op"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Selector": [[5, 2, 1, "", "process"]], "data_juicer.ops.common": [[6, 0, 0, "-", "helper_func"], [6, 0, 0, "-", "special_characters"]], "data_juicer.ops.common.helper_func": [[6, 1, 1, "", "UnionFind"], [6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.common.helper_func.UnionFind": [[6, 2, 1, "", "find"], [6, 2, 1, "", "union"]], "data_juicer.ops.deduplicator": [[7, 0, 0, "-", "document_deduplicator"], [7, 0, 0, "-", "document_minhash_deduplicator"], [7, 0, 0, "-", "document_simhash_deduplicator"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"]], "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 3, 1, "", "optimal_param"], [7, 3, 1, "", "sha1_hash32"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 3, 1, "", "local_num_differing_bits"], [7, 3, 1, "", "num_differing_bits_selector"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 0, 0, "-", "alphanumeric_filter"], [8, 0, 0, "-", "average_line_length_filter"], [8, 0, 0, "-", "character_repetition_filter"], [8, 0, 0, "-", "flagged_words_filter"], [8, 0, 0, "-", "language_id_score_filter"], [8, 0, 0, "-", "maximum_line_length_filter"], [8, 0, 0, "-", "perplexity_filter"], [8, 0, 0, "-", "special_characters_filter"], [8, 0, 0, "-", "specified_field_filter"], [8, 0, 0, "-", "specified_numeric_field_filter"], [8, 0, 0, "-", "stopwords_filter"], [8, 0, 0, "-", "suffix_filter"], [8, 0, 0, "-", "text_length_filter"], [8, 0, 0, "-", "token_num_filter"], [8, 0, 0, "-", "word_num_filter"], [8, 0, 0, "-", "word_repetition_filter"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, 1, 1, "", "AlphanumericFilter"]], "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.average_line_length_filter": [[8, 1, 1, "", "AverageLineLengthFilter"]], "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.character_repetition_filter": [[8, 1, 1, "", "CharacterRepetitionFilter"]], "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], 
"data_juicer.ops.filter.flagged_words_filter": [[8, 1, 1, "", "FlaggedWordFilter"]], "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.language_id_score_filter": [[8, 1, 1, "", "LanguageIDScoreFilter"]], "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, 1, 1, "", "MaximumLineLengthFilter"]], "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.perplexity_filter": [[8, 1, 1, "", "PerplexityFilter"]], "data_juicer.ops.filter.perplexity_filter.PerplexityFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.special_characters_filter": [[8, 1, 1, "", "SpecialCharactersFilter"]], "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.specified_field_filter": [[8, 1, 1, "", "SpecifiedFieldFilter"]], "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 3, 1, "", "is_number"]], "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.stopwords_filter": [[8, 1, 1, "", "StopWordsFilter"]], "data_juicer.ops.filter.stopwords_filter.StopWordsFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.suffix_filter": [[8, 1, 1, "", "SuffixFilter"]], "data_juicer.ops.filter.suffix_filter.SuffixFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.text_length_filter": [[8, 1, 1, "", "TextLengthFilter"]], "data_juicer.ops.filter.text_length_filter.TextLengthFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.token_num_filter": [[8, 1, 1, "", "TokenNumFilter"]], "data_juicer.ops.filter.token_num_filter.TokenNumFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.word_num_filter": [[8, 1, 1, "", "WordNumFilter"]], "data_juicer.ops.filter.word_num_filter.WordNumFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.word_repetition_filter": [[8, 1, 1, "", "WordRepetitionFilter"]], "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter": [[8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.load": [[5, 3, 1, "", "load_ops"]], "data_juicer.ops.mapper": [[9, 0, 0, "-", "clean_copyright_mapper"], [9, 0, 0, "-", "clean_email_mapper"], [9, 0, 0, "-", "clean_html_mapper"], [9, 0, 0, "-", "clean_ip_mapper"], [9, 0, 0, "-", "clean_links_mapper"], [9, 0, 0, "-", "expand_macro_mapper"], [9, 0, 0, "-", "fix_unicode_mapper"], [9, 0, 0, "-", "nlpaug_en_mapper"], [9, 0, 0, "-", "nlpcda_zh_mapper"], [9, 0, 0, "-", "punctuation_normalization_mapper"], [9, 0, 0, "-", "remove_bibliography_mapper"], [9, 0, 0, "-", "remove_comments_mapper"], [9, 0, 0, "-", "remove_header_mapper"], [9, 0, 0, "-", "remove_long_words_mapper"], [9, 0, 0, "-", "remove_specific_chars_mapper"], [9, 0, 0, "-", "remove_table_text_mapper"], [9, 0, 0, "-", 
"remove_words_with_incorrect_substrings_mapper"], [9, 0, 0, "-", "sentence_split_mapper"], [9, 0, 0, "-", "whitespace_normalization_mapper"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, 1, 1, "", "CleanCopyrightMapper"]], "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, 1, 1, "", "CleanEmailMapper"]], "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, 1, 1, "", "CleanHtmlMapper"]], "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, 1, 1, "", "CleanIpMapper"]], "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, 1, 1, "", "CleanLinksMapper"]], "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, 1, 1, "", "ExpandMacroMapper"]], "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, 1, 1, "", "FixUnicodeMapper"]], "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, 1, 1, "", "NlpaugEnMapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, 1, 1, "", "NlpcdaZhMapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, 1, 1, "", "PunctuationNormalizationMapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, 1, 1, "", "RemoveBibliographyMapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, 1, 1, "", "RemoveCommentsMapper"]], "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, 1, 1, "", "RemoveHeaderMapper"]], "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, 1, 1, "", "RemoveLongWordsMapper"]], "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, 1, 1, "", "RemoveSpecificCharsMapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, 1, 1, "", "RemoveTableTextMapper"]], "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, 1, 1, "", "SentenceSplitMapper"]], 
"data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "process"]], "data_juicer.ops.op_fusion": [[5, 1, 1, "", "FusedFilter"], [5, 3, 1, "", "fuse_filter_group"], [5, 3, 1, "", "fuse_operators"]], "data_juicer.ops.op_fusion.FusedFilter": [[5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 0, 0, "-", "frequency_specified_field_selector"], [10, 0, 0, "-", "topk_specified_field_selector"]], "data_juicer.ops.selector.frequency_specified_field_selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"]], "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "process"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, 1, 1, "", "TopkSpecifiedFieldSelector"], [10, 3, 1, "", "to_number"]], "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "process"]], "data_juicer.utils": [[12, 0, 0, "-", "asset_utils"], [12, 0, 0, "-", "cache_utils"], [12, 0, 0, "-", "ckpt_utils"], [12, 0, 0, "-", "compress"], [12, 0, 0, "-", "constant"], [12, 0, 0, "-", "file_utils"], [12, 0, 0, "-", "fingerprint_utils"], [12, 0, 0, "-", "logger_utils"], [12, 0, 0, "-", "model_utils"], [12, 0, 0, "-", "registry"]], "data_juicer.utils.asset_utils": [[12, 3, 1, "", "load_words_asset"]], "data_juicer.utils.ckpt_utils": [[12, 1, 1, "", "CheckpointManager"]], "data_juicer.utils.ckpt_utils.CheckpointManager": [[12, 2, 1, "", "check_ckpt"], [12, 2, 1, "", "check_ops_to_skip"], [12, 2, 1, "", "get_left_process_list"], [12, 2, 1, "", "load_ckpt"], [12, 2, 1, "", "record"], [12, 2, 1, "", "save_ckpt"]], "data_juicer.utils.compress": [[12, 1, 1, "", "BaseCompressor"], [12, 1, 1, "", "CacheCompressManager"], [12, 1, 1, "", "CompressManager"], [12, 1, 1, "", "CompressionOff"], [12, 1, 1, "", "Compressor"], [12, 1, 1, "", "Extractor"], [12, 1, 1, "", "FileLock"], [12, 1, 1, "", "GzipCompressor"], [12, 1, 1, "", "Lz4Compressor"], [12, 1, 1, "", "ZstdCompressor"], [12, 3, 1, "", "cleanup_compressed_cache_files"], [12, 3, 1, "", "compress"], [12, 3, 1, "", "decompress"]], "data_juicer.utils.compress.BaseCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.CacheCompressManager": [[12, 2, 1, "", "cleanup_cache_files"], [12, 2, 1, "", "compress"], [12, 2, 1, "", "decompress"], [12, 2, 1, "", "format_cache_file_name"]], "data_juicer.utils.compress.CompressManager": [[12, 2, 1, "", "compress"], [12, 2, 1, "", "decompress"]], "data_juicer.utils.compress.Compressor": [[12, 2, 1, "", "compress"], [12, 4, 1, "", "compressors"]], "data_juicer.utils.compress.Extractor": [[12, 2, 1, "", "extract"]], "data_juicer.utils.compress.GzipCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.Lz4Compressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.ZstdCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.constant": [[12, 1, 1, "", "Fields"], [12, 1, 1, "", "HashKeys"], [12, 1, 1, "", "InterVars"], [12, 1, 1, "", "StatsKeys"]], "data_juicer.utils.constant.Fields": [[12, 4, 1, "", "context"], [12, 4, 1, "", "meta"], [12, 4, 1, "", "stats"], [12, 4, 1, "", "suffix"]], "data_juicer.utils.constant.HashKeys": [[12, 4, 1, "", "hash"], [12, 4, 1, "", "minhash"], [12, 4, 1, "", "simhash"]], 
"data_juicer.utils.constant.InterVars": [[12, 4, 1, "", "lines"], [12, 4, 1, "", "refined_words"], [12, 4, 1, "", "words"]], "data_juicer.utils.constant.StatsKeys": [[12, 4, 1, "", "alnum_ratio"], [12, 4, 1, "", "alpha_token_ratio"], [12, 4, 1, "", "avg_line_length"], [12, 4, 1, "", "char_rep_ratio"], [12, 4, 1, "", "flagged_words_ratio"], [12, 4, 1, "", "lang"], [12, 4, 1, "", "lang_score"], [12, 4, 1, "", "max_line_length"], [12, 4, 1, "", "num_token"], [12, 4, 1, "", "num_words"], [12, 4, 1, "", "perplexity"], [12, 4, 1, "", "special_char_ratio"], [12, 4, 1, "", "stopwords_ratio"], [12, 4, 1, "", "text_len"], [12, 4, 1, "", "word_rep_ratio"]], "data_juicer.utils.file_utils": [[12, 3, 1, "", "find_files_with_suffix"], [12, 3, 1, "", "is_absolute_path"]], "data_juicer.utils.fingerprint_utils": [[12, 1, 1, "", "Hasher"], [12, 3, 1, "", "generate_fingerprint"], [12, 3, 1, "", "update_fingerprint"]], "data_juicer.utils.fingerprint_utils.Hasher": [[12, 4, 1, "", "dispatch"], [12, 2, 1, "", "hash"], [12, 2, 1, "", "hash_bytes"], [12, 2, 1, "", "hash_default"], [12, 2, 1, "", "hexdigest"], [12, 2, 1, "", "update"]], "data_juicer.utils.logger_utils": [[12, 1, 1, "", "HiddenPrints"], [12, 1, 1, "", "StreamToLoguru"], [12, 3, 1, "", "get_caller_name"], [12, 3, 1, "", "get_log_file_path"], [12, 3, 1, "", "redirect_sys_output"], [12, 3, 1, "", "setup_logger"]], "data_juicer.utils.logger_utils.StreamToLoguru": [[12, 2, 1, "", "flush"], [12, 2, 1, "", "write"]], "data_juicer.utils.model_utils": [[12, 3, 1, "", "check_model"], [12, 3, 1, "", "get_model"], [12, 3, 1, "", "prepare_diversity_model"], [12, 3, 1, "", "prepare_fasttext_model"], [12, 3, 1, "", "prepare_huggingface_tokenizer"], [12, 3, 1, "", "prepare_kenlm_model"], [12, 3, 1, "", "prepare_model"], [12, 3, 1, "", "prepare_nltk_model"], [12, 3, 1, "", "prepare_sentencepiece_model"]], "data_juicer.utils.registry": [[12, 1, 1, "", "Registry"]], "data_juicer.utils.registry.Registry": [[12, 2, 1, "", "get"], [12, 2, 1, "", "list"], [12, 5, 1, "", "modules"], [12, 5, 1, "", "name"], [12, 2, 1, "", "register_module"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"d": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "t": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "_": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "j": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "u": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "c": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "e": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "r": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "n": [1, 2, 6], "l": [1, 7, 8, 10, 11, 12], "y": 1, "": [1, 5, 6, 7, 8, 9, 10, 11, 12, 13], "data_juic": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14], "analysi": 1, "column_wise_analysi": 1, "diversity_analysi": 1, "overall_analysi": 1, "o": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "f": [2, 4, 8], "g": 2, "config": 2, "core": 3, "analys": 3, "data": [3, 13], "executor": 3, "export": 3, "ray_executor": 3, "tracer": 3, "m": [4, 6, 9], "format": 4, "csv_formatt": 4, "formatt": 4, "json_formatt": 4, "load": [4, 5], "mixture_formatt": 4, "parquet_formatt": 4, "text_formatt": 4, "tsv_formatt": 4, "p": [5, 6, 7, 8, 
9, 10], "op": [5, 6, 7, 8, 9, 10], "base_op": 5, "op_fus": 5, "common": 6, "helper_func": 6, "special_charact": 6, "dedupl": 7, "document_dedupl": 7, "document_minhash_dedupl": 7, "document_simhash_dedupl": 7, "filter": 8, "alphanumeric_filt": 8, "average_line_length_filt": 8, "character_repetition_filt": 8, "flagged_words_filt": 8, "language_id_score_filt": 8, "maximum_line_length_filt": 8, "perplexity_filt": 8, "special_characters_filt": 8, "specified_field_filt": 8, "specified_numeric_field_filt": 8, "stopwords_filt": 8, "suffix_filt": 8, "text_length_filt": 8, "token_num_filt": 8, "word_num_filt": 8, "word_repetition_filt": 8, "mapper": 9, "clean_copyright_mapp": 9, "clean_email_mapp": 9, "clean_html_mapp": 9, "clean_ip_mapp": 9, "clean_links_mapp": 9, "expand_macro_mapp": 9, "fix_unicode_mapp": 9, "nlpaug_en_mapp": 9, "nlpcda_zh_mapp": 9, "punctuation_normalization_mapp": 9, "remove_bibliography_mapp": 9, "remove_comments_mapp": 9, "remove_header_mapp": 9, "remove_long_words_mapp": 9, "remove_specific_chars_mapp": 9, "remove_table_text_mapp": 9, "remove_words_with_incorrect_substrings_mapp": 9, "sentence_split_mapp": 9, "whitespace_normalization_mapp": 9, "selector": 10, "frequency_specified_field_selector": 10, "topk_specified_field_selector": 10, "util": 12, "asset_util": 12, "cache_util": 12, "ckpt_util": 12, "compress": 12, "constant": 12, "file_util": 12, "fingerprint_util": 12, "logger_util": 12, "model_util": 12, "registri": 12, "welcom": 13, "juicer": 13, "document": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"d a t a _ j u i c e r": [[0, "d-a-t-a-j-u-i-c-e-r"]], "d a t a _ j u i c e r . a n a l y s i s": [[1, "d-a-t-a-j-u-i-c-e-r-a-n-a-l-y-s-i-s"]], "data_juicer.analysis.column_wise_analysis": [[1, "module-data_juicer.analysis.column_wise_analysis"]], "data_juicer.analysis.diversity_analysis": [[1, "module-data_juicer.analysis.diversity_analysis"]], "data_juicer.analysis.overall_analysis": [[1, "module-data_juicer.analysis.overall_analysis"]], "d a t a _ j u i c e r . c o n f i g": [[2, "d-a-t-a-j-u-i-c-e-r-c-o-n-f-i-g"]], "data_juicer.config.config": [[2, "module-data_juicer.config.config"]], "d a t a _ j u i c e r . c o r e": [[3, "d-a-t-a-j-u-i-c-e-r-c-o-r-e"]], "data_juicer.core.analyser": [[3, "module-data_juicer.core.analyser"]], "data_juicer.core.data": [[3, "module-data_juicer.core.data"]], "data_juicer.core.executor": [[3, "module-data_juicer.core.executor"]], "data_juicer.core.exporter": [[3, "module-data_juicer.core.exporter"]], "data_juicer.core.ray_executor": [[3, "module-data_juicer.core.ray_executor"]], "data_juicer.core.tracer": [[3, "module-data_juicer.core.tracer"]], "d a t a _ j u i c e r . 
f o r m a t": [[4, "d-a-t-a-j-u-i-c-e-r-f-o-r-m-a-t"]], "data_juicer.format.csv_formatter": [[4, "module-data_juicer.format.csv_formatter"]], "data_juicer.format.formatter": [[4, "module-data_juicer.format.formatter"]], "data_juicer.format.json_formatter": [[4, "module-data_juicer.format.json_formatter"]], "data_juicer.format.load": [[4, "module-data_juicer.format.load"]], "data_juicer.format.mixture_formatter": [[4, "module-data_juicer.format.mixture_formatter"]], "data_juicer.format.parquet_formatter": [[4, "module-data_juicer.format.parquet_formatter"]], "data_juicer.format.text_formatter": [[4, "module-data_juicer.format.text_formatter"]], "data_juicer.format.tsv_formatter": [[4, "module-data_juicer.format.tsv_formatter"]], "d a t a _ j u i c e r . o p s": [[5, "d-a-t-a-j-u-i-c-e-r-o-p-s"]], "data_juicer.ops.base_op": [[5, "module-data_juicer.ops.base_op"]], "data_juicer.ops.load": [[5, "module-data_juicer.ops.load"]], "data_juicer.ops.op_fusion": [[5, "module-data_juicer.ops.op_fusion"]], "d a t a _ j u i c e r . o p s . c o m m o n": [[6, "d-a-t-a-j-u-i-c-e-r-o-p-s-c-o-m-m-o-n"]], "data_juicer.ops.common.helper_func": [[6, "module-data_juicer.ops.common.helper_func"]], "data_juicer.ops.common.special_characters": [[6, "module-data_juicer.ops.common.special_characters"]], "d a t a _ j u i c e r . o p s . d e d u p l i c a t o r": [[7, "d-a-t-a-j-u-i-c-e-r-o-p-s-d-e-d-u-p-l-i-c-a-t-o-r"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_deduplicator"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"]], "d a t a _ j u i c e r . o p s . 
f i l t e r": [[8, "d-a-t-a-j-u-i-c-e-r-o-p-s-f-i-l-t-e-r"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, "module-data_juicer.ops.filter.alphanumeric_filter"]], "data_juicer.ops.filter.average_line_length_filter": [[8, "module-data_juicer.ops.filter.average_line_length_filter"]], "data_juicer.ops.filter.character_repetition_filter": [[8, "module-data_juicer.ops.filter.character_repetition_filter"]], "data_juicer.ops.filter.flagged_words_filter": [[8, "module-data_juicer.ops.filter.flagged_words_filter"]], "data_juicer.ops.filter.language_id_score_filter": [[8, "module-data_juicer.ops.filter.language_id_score_filter"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, "module-data_juicer.ops.filter.maximum_line_length_filter"]], "data_juicer.ops.filter.perplexity_filter": [[8, "module-data_juicer.ops.filter.perplexity_filter"]], "data_juicer.ops.filter.special_characters_filter": [[8, "module-data_juicer.ops.filter.special_characters_filter"]], "data_juicer.ops.filter.specified_field_filter": [[8, "module-data_juicer.ops.filter.specified_field_filter"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, "module-data_juicer.ops.filter.specified_numeric_field_filter"]], "data_juicer.ops.filter.stopwords_filter": [[8, "module-data_juicer.ops.filter.stopwords_filter"]], "data_juicer.ops.filter.suffix_filter": [[8, "module-data_juicer.ops.filter.suffix_filter"]], "data_juicer.ops.filter.text_length_filter": [[8, "module-data_juicer.ops.filter.text_length_filter"]], "data_juicer.ops.filter.token_num_filter": [[8, "module-data_juicer.ops.filter.token_num_filter"]], "data_juicer.ops.filter.word_num_filter": [[8, "module-data_juicer.ops.filter.word_num_filter"]], "data_juicer.ops.filter.word_repetition_filter": [[8, "module-data_juicer.ops.filter.word_repetition_filter"]], "d a t a _ j u i c e r . o p s . 
m a p p e r": [[9, "d-a-t-a-j-u-i-c-e-r-o-p-s-m-a-p-p-e-r"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, "module-data_juicer.ops.mapper.clean_copyright_mapper"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, "module-data_juicer.ops.mapper.clean_email_mapper"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, "module-data_juicer.ops.mapper.clean_html_mapper"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, "module-data_juicer.ops.mapper.clean_ip_mapper"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, "module-data_juicer.ops.mapper.clean_links_mapper"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, "module-data_juicer.ops.mapper.expand_macro_mapper"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, "module-data_juicer.ops.mapper.fix_unicode_mapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, "module-data_juicer.ops.mapper.remove_comments_mapper"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, "module-data_juicer.ops.mapper.remove_header_mapper"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, "module-data_juicer.ops.mapper.remove_long_words_mapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, "module-data_juicer.ops.mapper.remove_table_text_mapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, "module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, "module-data_juicer.ops.mapper.sentence_split_mapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"]], "d a t a _ j u i c e r . o p s . s e l e c t o r": [[10, "d-a-t-a-j-u-i-c-e-r-o-p-s-s-e-l-e-c-t-o-r"]], "data_juicer.ops.selector.frequency_specified_field_selector": [[10, "module-data_juicer.ops.selector.frequency_specified_field_selector"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, "module-data_juicer.ops.selector.topk_specified_field_selector"]], "d a t a _ j u i c e r . t o o l s": [[11, "d-a-t-a-j-u-i-c-e-r-t-o-o-l-s"]], "d a t a _ j u i c e r . 
u t i l s": [[12, "d-a-t-a-j-u-i-c-e-r-u-t-i-l-s"]], "data_juicer.utils.asset_utils": [[12, "module-data_juicer.utils.asset_utils"]], "data_juicer.utils.cache_utils": [[12, "module-data_juicer.utils.cache_utils"]], "data_juicer.utils.ckpt_utils": [[12, "module-data_juicer.utils.ckpt_utils"]], "data_juicer.utils.compress": [[12, "module-data_juicer.utils.compress"]], "data_juicer.utils.constant": [[12, "module-data_juicer.utils.constant"]], "data_juicer.utils.file_utils": [[12, "module-data_juicer.utils.file_utils"]], "data_juicer.utils.fingerprint_utils": [[12, "module-data_juicer.utils.fingerprint_utils"]], "data_juicer.utils.logger_utils": [[12, "module-data_juicer.utils.logger_utils"]], "data_juicer.utils.model_utils": [[12, "module-data_juicer.utils.model_utils"]], "data_juicer.utils.registry": [[12, "module-data_juicer.utils.registry"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "data_juicer": [[13, "data-juicer"], [14, "data-juicer"]], "Indices and tables": [[13, "indices-and-tables"]]}, "indexentries": {"columnwiseanalysis (class in data_juicer.analysis.column_wise_analysis)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis.overall_analysis)": [[1, "data_juicer.analysis.overall_analysis.OverallAnalysis"]], "analyse() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.analyse"]], "analyse() (data_juicer.analysis.diversity_analysis.diversityanalysis method)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis.analyse"]], "analyse() (data_juicer.analysis.overall_analysis.overallanalysis method)": [[1, "data_juicer.analysis.overall_analysis.OverallAnalysis.analyse"]], "compute() (data_juicer.analysis.diversity_analysis.diversityanalysis method)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis.compute"]], "data_juicer.analysis.column_wise_analysis": [[1, "module-data_juicer.analysis.column_wise_analysis"]], "data_juicer.analysis.diversity_analysis": [[1, "module-data_juicer.analysis.diversity_analysis"]], "data_juicer.analysis.overall_analysis": [[1, "module-data_juicer.analysis.overall_analysis"]], "draw_box() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.draw_hist"]], "find_root_verb_and_its_dobj() (in module data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj"]], "find_root_verb_and_its_dobj_in_string() (in module data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj_in_string"]], "get_diversity() (in module data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.get_diversity"]], "get_row_col() (in module data_juicer.analysis.column_wise_analysis)": [[1, "data_juicer.analysis.column_wise_analysis.get_row_col"]], "module": [[1, "module-data_juicer.analysis.column_wise_analysis"], [1, "module-data_juicer.analysis.diversity_analysis"], [1, "module-data_juicer.analysis.overall_analysis"], [2, 
"module-data_juicer.config.config"], [3, "module-data_juicer.core.analyser"], [3, "module-data_juicer.core.data"], [3, "module-data_juicer.core.executor"], [3, "module-data_juicer.core.exporter"], [3, "module-data_juicer.core.ray_executor"], [3, "module-data_juicer.core.tracer"], [4, "module-data_juicer.format.csv_formatter"], [4, "module-data_juicer.format.formatter"], [4, "module-data_juicer.format.json_formatter"], [4, "module-data_juicer.format.load"], [4, "module-data_juicer.format.mixture_formatter"], [4, "module-data_juicer.format.parquet_formatter"], [4, "module-data_juicer.format.text_formatter"], [4, "module-data_juicer.format.tsv_formatter"], [5, "module-data_juicer.ops.base_op"], [5, "module-data_juicer.ops.load"], [5, "module-data_juicer.ops.op_fusion"], [6, "module-data_juicer.ops.common.helper_func"], [6, "module-data_juicer.ops.common.special_characters"], [7, "module-data_juicer.ops.deduplicator.document_deduplicator"], [7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"], [7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"], [8, "module-data_juicer.ops.filter.alphanumeric_filter"], [8, "module-data_juicer.ops.filter.average_line_length_filter"], [8, "module-data_juicer.ops.filter.character_repetition_filter"], [8, "module-data_juicer.ops.filter.flagged_words_filter"], [8, "module-data_juicer.ops.filter.language_id_score_filter"], [8, "module-data_juicer.ops.filter.maximum_line_length_filter"], [8, "module-data_juicer.ops.filter.perplexity_filter"], [8, "module-data_juicer.ops.filter.special_characters_filter"], [8, "module-data_juicer.ops.filter.specified_field_filter"], [8, "module-data_juicer.ops.filter.specified_numeric_field_filter"], [8, "module-data_juicer.ops.filter.stopwords_filter"], [8, "module-data_juicer.ops.filter.suffix_filter"], [8, "module-data_juicer.ops.filter.text_length_filter"], [8, "module-data_juicer.ops.filter.token_num_filter"], [8, "module-data_juicer.ops.filter.word_num_filter"], [8, "module-data_juicer.ops.filter.word_repetition_filter"], [9, "module-data_juicer.ops.mapper.clean_copyright_mapper"], [9, "module-data_juicer.ops.mapper.clean_email_mapper"], [9, "module-data_juicer.ops.mapper.clean_html_mapper"], [9, "module-data_juicer.ops.mapper.clean_ip_mapper"], [9, "module-data_juicer.ops.mapper.clean_links_mapper"], [9, "module-data_juicer.ops.mapper.expand_macro_mapper"], [9, "module-data_juicer.ops.mapper.fix_unicode_mapper"], [9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"], [9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"], [9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"], [9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"], [9, "module-data_juicer.ops.mapper.remove_comments_mapper"], [9, "module-data_juicer.ops.mapper.remove_header_mapper"], [9, "module-data_juicer.ops.mapper.remove_long_words_mapper"], [9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"], [9, "module-data_juicer.ops.mapper.remove_table_text_mapper"], [9, "module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"], [9, "module-data_juicer.ops.mapper.sentence_split_mapper"], [9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"], [10, "module-data_juicer.ops.selector.frequency_specified_field_selector"], [10, "module-data_juicer.ops.selector.topk_specified_field_selector"], [12, "module-data_juicer.utils.asset_utils"], [12, "module-data_juicer.utils.cache_utils"], [12, "module-data_juicer.utils.ckpt_utils"], [12, 
"module-data_juicer.utils.compress"], [12, "module-data_juicer.utils.constant"], [12, "module-data_juicer.utils.file_utils"], [12, "module-data_juicer.utils.fingerprint_utils"], [12, "module-data_juicer.utils.logger_utils"], [12, "module-data_juicer.utils.model_utils"], [12, "module-data_juicer.utils.registry"]], "config_backup() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.config_backup"]], "data_juicer.config.config": [[2, "module-data_juicer.config.config"]], "display_config() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.display_config"]], "init_configs() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.init_configs"]], "init_setup_from_cfg() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.init_setup_from_cfg"]], "sort_op_by_types_and_names() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.sort_op_by_types_and_names"]], "analyser (class in data_juicer.core.analyser)": [[3, "data_juicer.core.analyser.Analyser"]], "executor (class in data_juicer.core.executor)": [[3, "data_juicer.core.executor.Executor"]], "exporter (class in data_juicer.core.exporter)": [[3, "data_juicer.core.exporter.Exporter"]], "gib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.GiB"]], "kib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.KiB"]], "mib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.MiB"]], "nesteddataset (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedDataset"]], "nesteddatasetdict (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedDatasetDict"]], "nestedquerydict (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedQueryDict"]], "rayexecutor (class in data_juicer.core.ray_executor)": [[3, "data_juicer.core.ray_executor.RayExecutor"]], "tib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.TiB"]], "tracer (class in data_juicer.core.tracer)": [[3, "data_juicer.core.tracer.Tracer"]], "add_column() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.cleanup_cache_files"]], "data_juicer.core.analyser": [[3, "module-data_juicer.core.analyser"]], "data_juicer.core.data": [[3, "module-data_juicer.core.data"]], "data_juicer.core.executor": [[3, "module-data_juicer.core.executor"]], "data_juicer.core.exporter": [[3, "module-data_juicer.core.exporter"]], "data_juicer.core.ray_executor": [[3, "module-data_juicer.core.ray_executor"]], "data_juicer.core.tracer": [[3, "module-data_juicer.core.tracer"]], "export() (data_juicer.core.exporter.exporter method)": [[3, "data_juicer.core.exporter.Exporter.export"]], "filter() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.filter"]], "from_dict() (data_juicer.core.data.nesteddataset class method)": [[3, "data_juicer.core.data.NestedDataset.from_dict"]], "map() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.map"]], "map() (data_juicer.core.data.nesteddatasetdict method)": [[3, "data_juicer.core.data.NestedDatasetDict.map"]], "nested_obj_factory() (in module data_juicer.core.data)": [[3, "data_juicer.core.data.nested_obj_factory"]], "nested_query() (in module data_juicer.core.data)": [[3, 
"data_juicer.core.data.nested_query"]], "remove_columns() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyser.analyser method)": [[3, "data_juicer.core.analyser.Analyser.run"]], "run() (data_juicer.core.executor.executor method)": [[3, "data_juicer.core.executor.Executor.run"]], "run() (data_juicer.core.ray_executor.rayexecutor method)": [[3, "data_juicer.core.ray_executor.RayExecutor.run"]], "select() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.select"]], "select_columns() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.select_columns"]], "to_jsonl() (data_juicer.core.exporter.exporter static method)": [[3, "data_juicer.core.exporter.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter.exporter static method)": [[3, "data_juicer.core.exporter.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_mapper"]], "wrap_func_with_nested_access() (in module data_juicer.core.data)": [[3, "data_juicer.core.data.wrap_func_with_nested_access"]], "baseformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.BaseFormatter"]], "csvformatter (class in data_juicer.format.csv_formatter)": [[4, "data_juicer.format.csv_formatter.CsvFormatter"]], "jsonformatter (class in data_juicer.format.json_formatter)": [[4, "data_juicer.format.json_formatter.JsonFormatter"]], "localformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.LocalFormatter"]], "mixtureformatter (class in data_juicer.format.mixture_formatter)": [[4, "data_juicer.format.mixture_formatter.MixtureFormatter"]], "parquetformatter (class in data_juicer.format.parquet_formatter)": [[4, "data_juicer.format.parquet_formatter.ParquetFormatter"]], "remoteformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.RemoteFormatter"]], "suffixes (data_juicer.format.csv_formatter.csvformatter attribute)": [[4, "data_juicer.format.csv_formatter.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.json_formatter.jsonformatter attribute)": [[4, "data_juicer.format.json_formatter.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquet_formatter.parquetformatter attribute)": [[4, "data_juicer.format.parquet_formatter.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.text_formatter.textformatter attribute)": [[4, "data_juicer.format.text_formatter.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsv_formatter.tsvformatter attribute)": [[4, "data_juicer.format.tsv_formatter.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.TextFormatter"]], "tsvformatter (class in data_juicer.format.tsv_formatter)": [[4, "data_juicer.format.tsv_formatter.TsvFormatter"]], "add_suffixes() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.add_suffixes"]], "data_juicer.format.csv_formatter": [[4, "module-data_juicer.format.csv_formatter"]], 
"data_juicer.format.formatter": [[4, "module-data_juicer.format.formatter"]], "data_juicer.format.json_formatter": [[4, "module-data_juicer.format.json_formatter"]], "data_juicer.format.load": [[4, "module-data_juicer.format.load"]], "data_juicer.format.mixture_formatter": [[4, "module-data_juicer.format.mixture_formatter"]], "data_juicer.format.parquet_formatter": [[4, "module-data_juicer.format.parquet_formatter"]], "data_juicer.format.text_formatter": [[4, "module-data_juicer.format.text_formatter"]], "data_juicer.format.tsv_formatter": [[4, "module-data_juicer.format.tsv_formatter"]], "extract_txt_from_docx() (in module data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.extract_txt_from_docx"]], "extract_txt_from_pdf() (in module data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.extract_txt_from_pdf"]], "load_dataset() (data_juicer.format.formatter.baseformatter method)": [[4, "data_juicer.format.formatter.BaseFormatter.load_dataset"]], "load_dataset() (data_juicer.format.formatter.localformatter method)": [[4, "data_juicer.format.formatter.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.formatter.remoteformatter method)": [[4, "data_juicer.format.formatter.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixture_formatter.mixtureformatter method)": [[4, "data_juicer.format.mixture_formatter.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.text_formatter.textformatter method)": [[4, "data_juicer.format.text_formatter.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.load_formatter"]], "load_formatter() (in module data_juicer.format.load)": [[4, "data_juicer.format.load.load_formatter"]], "unify_format() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.unify_format"]], "deduplicator (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Deduplicator"]], "filter (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Filter"]], "fusedfilter (class in data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.FusedFilter"]], "mapper (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Mapper"]], "selector (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Selector"]], "compute_hash() (data_juicer.ops.base_op.deduplicator method)": [[5, "data_juicer.ops.base_op.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.base_op.filter method)": [[5, "data_juicer.ops.base_op.Filter.compute_stats"]], "compute_stats() (data_juicer.ops.op_fusion.fusedfilter method)": [[5, "data_juicer.ops.op_fusion.FusedFilter.compute_stats"]], "data_juicer.ops.base_op": [[5, "module-data_juicer.ops.base_op"]], "data_juicer.ops.load": [[5, "module-data_juicer.ops.load"]], "data_juicer.ops.op_fusion": [[5, "module-data_juicer.ops.op_fusion"]], "fuse_filter_group() (in module data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.fuse_filter_group"]], "fuse_operators() (in module data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.fuse_operators"]], "is_batched_op() (data_juicer.ops.base_op.mapper method)": [[5, "data_juicer.ops.base_op.Mapper.is_batched_op"]], "load_ops() (in module data_juicer.ops.load)": [[5, "data_juicer.ops.load.load_ops"]], "process() (data_juicer.ops.base_op.deduplicator method)": [[5, "data_juicer.ops.base_op.Deduplicator.process"]], "process() (data_juicer.ops.base_op.filter method)": [[5, 
"data_juicer.ops.base_op.Filter.process"]], "process() (data_juicer.ops.base_op.mapper method)": [[5, "data_juicer.ops.base_op.Mapper.process"]], "process() (data_juicer.ops.base_op.selector method)": [[5, "data_juicer.ops.base_op.Selector.process"]], "process() (data_juicer.ops.op_fusion.fusedfilter method)": [[5, "data_juicer.ops.op_fusion.FusedFilter.process"]], "unionfind (class in data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.UnionFind"]], "data_juicer.ops.common.helper_func": [[6, "module-data_juicer.ops.common.helper_func"]], "data_juicer.ops.common.special_characters": [[6, "module-data_juicer.ops.common.special_characters"]], "find() (data_juicer.ops.common.helper_func.unionfind method)": [[6, "data_juicer.ops.common.helper_func.UnionFind.find"]], "get_sentences_from_document() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.split_on_whitespace"]], "strip() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.strip"]], "union() (data_juicer.ops.common.helper_func.unionfind method)": [[6, "data_juicer.ops.common.helper_func.UnionFind.union"]], "words_augmentation() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator.document_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator"]], "compute_hash() (data_juicer.ops.deduplicator.document_deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.document_minhash_deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.document_simhash_deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_deduplicator"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"]], 
"data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"]], "local_num_differing_bits() (in module data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.local_num_differing_bits"]], "num_differing_bits_selector() (in module data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.num_differing_bits_selector"]], "optimal_param() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.optimal_param"]], "process() (data_juicer.ops.deduplicator.document_deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.document_minhash_deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.document_simhash_deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator.process"]], "sha1_hash32() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.sha1_hash32"]], "alphanumericfilter (class in data_juicer.ops.filter.alphanumeric_filter)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter.average_line_length_filter)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter.character_repetition_filter)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter.flagged_words_filter)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter"]], "languageidscorefilter (class in data_juicer.ops.filter.language_id_score_filter)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter.maximum_line_length_filter)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter.perplexity_filter)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter.special_characters_filter)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter.specified_field_filter)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter.specified_numeric_field_filter)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter.stopwords_filter)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter.suffix_filter)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter"]], "textlengthfilter (class in data_juicer.ops.filter.text_length_filter)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter"]], "tokennumfilter (class in 
data_juicer.ops.filter.token_num_filter)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter"]], "wordnumfilter (class in data_juicer.ops.filter.word_num_filter)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter.word_repetition_filter)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter"]], "compute_stats() (data_juicer.ops.filter.alphanumeric_filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.average_line_length_filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.character_repetition_filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flagged_words_filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.language_id_score_filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximum_line_length_filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexity_filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.special_characters_filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specified_field_filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specified_numeric_field_filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwords_filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffix_filter.suffixfilter method)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.text_length_filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.token_num_filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.word_num_filter.wordnumfilter method)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.word_repetition_filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter.compute_stats"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, "module-data_juicer.ops.filter.alphanumeric_filter"]], "data_juicer.ops.filter.average_line_length_filter": [[8, "module-data_juicer.ops.filter.average_line_length_filter"]], 
"data_juicer.ops.filter.character_repetition_filter": [[8, "module-data_juicer.ops.filter.character_repetition_filter"]], "data_juicer.ops.filter.flagged_words_filter": [[8, "module-data_juicer.ops.filter.flagged_words_filter"]], "data_juicer.ops.filter.language_id_score_filter": [[8, "module-data_juicer.ops.filter.language_id_score_filter"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, "module-data_juicer.ops.filter.maximum_line_length_filter"]], "data_juicer.ops.filter.perplexity_filter": [[8, "module-data_juicer.ops.filter.perplexity_filter"]], "data_juicer.ops.filter.special_characters_filter": [[8, "module-data_juicer.ops.filter.special_characters_filter"]], "data_juicer.ops.filter.specified_field_filter": [[8, "module-data_juicer.ops.filter.specified_field_filter"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, "module-data_juicer.ops.filter.specified_numeric_field_filter"]], "data_juicer.ops.filter.stopwords_filter": [[8, "module-data_juicer.ops.filter.stopwords_filter"]], "data_juicer.ops.filter.suffix_filter": [[8, "module-data_juicer.ops.filter.suffix_filter"]], "data_juicer.ops.filter.text_length_filter": [[8, "module-data_juicer.ops.filter.text_length_filter"]], "data_juicer.ops.filter.token_num_filter": [[8, "module-data_juicer.ops.filter.token_num_filter"]], "data_juicer.ops.filter.word_num_filter": [[8, "module-data_juicer.ops.filter.word_num_filter"]], "data_juicer.ops.filter.word_repetition_filter": [[8, "module-data_juicer.ops.filter.word_repetition_filter"]], "is_number() (in module data_juicer.ops.filter.specified_numeric_field_filter)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.is_number"]], "process() (data_juicer.ops.filter.alphanumeric_filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.average_line_length_filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.character_repetition_filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flagged_words_filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.language_id_score_filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximum_line_length_filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexity_filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.special_characters_filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specified_field_filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specified_numeric_field_filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwords_filter.stopwordsfilter 
method)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffix_filter.suffixfilter method)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.text_length_filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.token_num_filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.word_num_filter.wordnumfilter method)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter.process"]], "process() (data_juicer.ops.filter.word_repetition_filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter.process"]], "cleancopyrightmapper (class in data_juicer.ops.mapper.clean_copyright_mapper)": [[9, "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper.clean_email_mapper)": [[9, "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper.clean_html_mapper)": [[9, "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper.clean_ip_mapper)": [[9, "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper.clean_links_mapper)": [[9, "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper.expand_macro_mapper)": [[9, "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper.fix_unicode_mapper)": [[9, "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper.nlpaug_en_mapper)": [[9, "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper.nlpcda_zh_mapper)": [[9, "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper.punctuation_normalization_mapper)": [[9, "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper.remove_bibliography_mapper)": [[9, "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper.remove_comments_mapper)": [[9, "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper.remove_header_mapper)": [[9, "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper.remove_long_words_mapper)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper.remove_specific_chars_mapper)": [[9, "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper.remove_table_text_mapper)": [[9, "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper"]], 
"sentencesplitmapper (class in data_juicer.ops.mapper.sentence_split_mapper)": [[9, "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper.whitespace_normalization_mapper)": [[9, "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, "module-data_juicer.ops.mapper.clean_copyright_mapper"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, "module-data_juicer.ops.mapper.clean_email_mapper"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, "module-data_juicer.ops.mapper.clean_html_mapper"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, "module-data_juicer.ops.mapper.clean_ip_mapper"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, "module-data_juicer.ops.mapper.clean_links_mapper"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, "module-data_juicer.ops.mapper.expand_macro_mapper"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, "module-data_juicer.ops.mapper.fix_unicode_mapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, "module-data_juicer.ops.mapper.remove_comments_mapper"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, "module-data_juicer.ops.mapper.remove_header_mapper"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, "module-data_juicer.ops.mapper.remove_long_words_mapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, "module-data_juicer.ops.mapper.remove_table_text_mapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, "module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, "module-data_juicer.ops.mapper.sentence_split_mapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"]], "process() (data_juicer.ops.mapper.clean_copyright_mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.clean_email_mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.clean_html_mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.clean_ip_mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.clean_links_mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expand_macro_mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.fix_unicode_mapper.fixunicodemapper method)": [[9, 
"data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.nlpaug_en_mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcda_zh_mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuation_normalization_mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.remove_bibliography_mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.remove_comments_mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.remove_header_mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.remove_long_words_mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.remove_specific_chars_mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.remove_table_text_mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.sentence_split_mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.whitespace_normalization_mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.remove_long_words_mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector.frequency_specified_field_selector)": [[10, "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector.topk_specified_field_selector)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.frequency_specified_field_selector": [[10, "module-data_juicer.ops.selector.frequency_specified_field_selector"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, "module-data_juicer.ops.selector.topk_specified_field_selector"]], "process() 
(data_juicer.ops.selector.frequency_specified_field_selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topk_specified_field_selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector.process"]], "to_number() (in module data_juicer.ops.selector.topk_specified_field_selector)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.to_number"]], "basecompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.BaseCompressor"]], "cachecompressmanager (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CacheCompressManager"]], "checkpointmanager (class in data_juicer.utils.ckpt_utils)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager"]], "compressmanager (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CompressManager"]], "compressionoff (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CompressionOff"]], "compressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Compressor"]], "extractor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Extractor"]], "fields (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.Fields"]], "filelock (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.FileLock"]], "gzipcompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.GzipCompressor"]], "hashkeys (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.HashKeys"]], "hasher (class in data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.Hasher"]], "hiddenprints (class in data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.HiddenPrints"]], "intervars (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.InterVars"]], "lz4compressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Lz4Compressor"]], "registry (class in data_juicer.utils.registry)": [[12, "data_juicer.utils.registry.Registry"]], "statskeys (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.StatsKeys"]], "streamtologuru (class in data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru"]], "zstdcompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.ZstdCompressor"]], "alnum_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.alnum_ratio"]], "alpha_token_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.alpha_token_ratio"]], "avg_line_length (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.avg_line_length"]], "char_rep_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.char_rep_ratio"]], "check_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.check_ckpt"]], "check_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.check_model"]], "check_ops_to_skip() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.check_ops_to_skip"]], "cleanup_cache_files() 
(data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.cleanup_cache_files"]], "cleanup_compressed_cache_files() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.cleanup_compressed_cache_files"]], "compress() (data_juicer.utils.compress.basecompressor static method)": [[12, "data_juicer.utils.compress.BaseCompressor.compress"]], "compress() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.compress"]], "compress() (data_juicer.utils.compress.compressmanager method)": [[12, "data_juicer.utils.compress.CompressManager.compress"]], "compress() (data_juicer.utils.compress.compressor class method)": [[12, "data_juicer.utils.compress.Compressor.compress"]], "compress() (data_juicer.utils.compress.gzipcompressor static method)": [[12, "data_juicer.utils.compress.GzipCompressor.compress"]], "compress() (data_juicer.utils.compress.lz4compressor static method)": [[12, "data_juicer.utils.compress.Lz4Compressor.compress"]], "compress() (data_juicer.utils.compress.zstdcompressor static method)": [[12, "data_juicer.utils.compress.ZstdCompressor.compress"]], "compress() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.compress"]], "compressors (data_juicer.utils.compress.compressor attribute)": [[12, "data_juicer.utils.compress.Compressor.compressors"]], "context (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.context"]], "data_juicer.utils.asset_utils": [[12, "module-data_juicer.utils.asset_utils"]], "data_juicer.utils.cache_utils": [[12, "module-data_juicer.utils.cache_utils"]], "data_juicer.utils.ckpt_utils": [[12, "module-data_juicer.utils.ckpt_utils"]], "data_juicer.utils.compress": [[12, "module-data_juicer.utils.compress"]], "data_juicer.utils.constant": [[12, "module-data_juicer.utils.constant"]], "data_juicer.utils.file_utils": [[12, "module-data_juicer.utils.file_utils"]], "data_juicer.utils.fingerprint_utils": [[12, "module-data_juicer.utils.fingerprint_utils"]], "data_juicer.utils.logger_utils": [[12, "module-data_juicer.utils.logger_utils"]], "data_juicer.utils.model_utils": [[12, "module-data_juicer.utils.model_utils"]], "data_juicer.utils.registry": [[12, "module-data_juicer.utils.registry"]], "decompress() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.decompress"]], "decompress() (data_juicer.utils.compress.compressmanager method)": [[12, "data_juicer.utils.compress.CompressManager.decompress"]], "decompress() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.decompress"]], "dispatch (data_juicer.utils.fingerprint_utils.hasher attribute)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.dispatch"]], "extract() (data_juicer.utils.compress.extractor class method)": [[12, "data_juicer.utils.compress.Extractor.extract"]], "find_files_with_suffix() (in module data_juicer.utils.file_utils)": [[12, "data_juicer.utils.file_utils.find_files_with_suffix"]], "flagged_words_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.flagged_words_ratio"]], "flush() (data_juicer.utils.logger_utils.streamtologuru method)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru.flush"]], "format_cache_file_name() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.format_cache_file_name"]], 
"generate_fingerprint() (in module data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.generate_fingerprint"]], "get() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.get"]], "get_caller_name() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.get_caller_name"]], "get_left_process_list() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.get_left_process_list"]], "get_log_file_path() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.get_log_file_path"]], "get_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.get_model"]], "hash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.hash"]], "hash() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash"]], "hash_bytes() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash_bytes"]], "hash_default() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash_default"]], "hexdigest() (data_juicer.utils.fingerprint_utils.hasher method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hexdigest"]], "is_absolute_path() (in module data_juicer.utils.file_utils)": [[12, "data_juicer.utils.file_utils.is_absolute_path"]], "lang (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.lang"]], "lang_score (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.lang_score"]], "lines (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.lines"]], "list() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.list"]], "load_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.load_ckpt"]], "load_words_asset() (in module data_juicer.utils.asset_utils)": [[12, "data_juicer.utils.asset_utils.load_words_asset"]], "max_line_length (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.max_line_length"]], "meta (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.meta"]], "minhash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.minhash"]], "modules (data_juicer.utils.registry.registry property)": [[12, "data_juicer.utils.registry.Registry.modules"]], "name (data_juicer.utils.registry.registry property)": [[12, "data_juicer.utils.registry.Registry.name"]], "num_token (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.num_token"]], "num_words (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.num_words"]], "perplexity (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.perplexity"]], "prepare_diversity_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_diversity_model"]], "prepare_fasttext_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_fasttext_model"]], "prepare_huggingface_tokenizer() (in module data_juicer.utils.model_utils)": [[12, 
"data_juicer.utils.model_utils.prepare_huggingface_tokenizer"]], "prepare_kenlm_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_kenlm_model"]], "prepare_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_model"]], "prepare_nltk_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_nltk_model"]], "prepare_sentencepiece_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_sentencepiece_model"]], "record() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.record"]], "redirect_sys_output() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.redirect_sys_output"]], "refined_words (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.refined_words"]], "register_module() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.register_module"]], "save_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.save_ckpt"]], "setup_logger() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.setup_logger"]], "simhash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.simhash"]], "special_char_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.special_char_ratio"]], "stats (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.stats"]], "stopwords_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.stopwords_ratio"]], "suffix (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.suffix"]], "text_len (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.text_len"]], "update() (data_juicer.utils.fingerprint_utils.hasher method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.update"]], "update_fingerprint() (in module data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.update_fingerprint"]], "word_rep_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.word_rep_ratio"]], "words (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.words"]], "write() (data_juicer.utils.logger_utils.streamtologuru method)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru.write"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["d a t a _ j u i c e r", "d a t a _ j u i c e r . 
a n a l y s i s", "d a t a _ j u i c e r . c o n f i g", "d a t a _ j u i c e r . c o r e", "d a t a _ j u i c e r . f o r m a t", "d a t a _ j u i c e r . o p s", "d a t a _ j u i c e r . o p s . c o m m o n", "d a t a _ j u i c e r . o p s . d e d u p l i c a t o r", "d a t a _ j u i c e r . o p s . f i l t e r", "d a t a _ j u i c e r . o p s . m a p p e r", "d a t a _ j u i c e r . o p s . s e l e c t o r", "d a t a _ j u i c e r . t o o l s", "d a t a _ j u i c e r . u t i l s", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"n": [0, 5, 8, 13, 14], "l": [0, 5, 13, 14], "y": [0, 6, 13, 14], "": [0, 3, 4, 14], "data_juic": 0, "analysi": [0, 3, 13, 14], "column_wise_analysi": [0, 13, 14], "columnwiseanalysi": [0, 1, 3, 13, 14], "__init__": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12], "analys": [0, 1, 2, 13, 14], "draw_box": [0, 1], "draw_hist": [0, 1], "get_row_col": [0, 1, 13, 14], "diversity_analysi": [0, 13, 14], "diversityanalysi": [0, 1, 13, 14], "comput": [0, 1, 3, 5, 6, 7, 8, 12], "find_root_verb_and_its_dobj": [0, 1, 13, 14], "find_root_verb_and_its_dobj_in_str": [0, 1, 13, 14], "get_divers": [0, 1, 13, 14], "overall_analysi": [0, 13, 14], "overallanalysi": [0, 1, 3, 13, 14], "o": [0, 12, 13, 14], "f": [0, 3, 5, 13, 14], "g": [0, 3, 4, 9, 13, 14], "config": [0, 3, 5, 12, 13, 14], "config_backup": [0, 2, 13, 14], "display_config": [0, 2, 13, 14], "init_config": [0, 2, 13, 14], "init_setup_from_cfg": [0, 2, 13, 14], "sort_op_by_types_and_nam": [0, 2, 13, 14], "core": [0, 13, 14], "run": [0, 3], "data": [0, 1, 4, 7, 14], "nesteddataset": [0, 3, 13, 14], "add_column": [0, 3], "cleanup_cache_fil": [0, 3, 12], "filter": [0, 3, 4, 5, 9, 13, 14], "from_dict": [0, 3], "map": [0, 3, 4], "remove_column": [0, 3], "select": [0, 3, 4, 5, 7, 10], "select_column": [0, 3], "nesteddatasetdict": [0, 3, 13, 14], "nestedquerydict": [0, 3, 13, 14], "nested_obj_factori": [0, 3, 13, 14], "nested_queri": [0, 3, 13, 14], "wrap_func_with_nested_access": [0, 3, 13, 14], "executor": [0, 2, 13, 14], "export": [0, 4, 13, 14], "gib": [0, 3], "kib": [0, 3], "mib": [0, 3], "tib": [0, 3], "to_jsonl": [0, 3], "to_parquet": [0, 3], "ray_executor": [0, 13, 14], "rayexecutor": [0, 3, 13, 14], "tracer": [0, 5, 7, 13, 14], "trace_batch_mapp": [0, 3], "trace_dedupl": [0, 3], "trace_filt": [0, 3], "trace_mapp": [0, 3], "m": [0, 5, 13, 14], "format": [0, 3, 8, 12, 13, 14], "csv_formatt": [0, 13, 14], "csvformatt": [0, 4, 13, 14], "suffix": [0, 4, 8, 12], "formatt": [0, 13, 14], "baseformatt": [0, 4, 13, 14], "load_dataset": [0, 4], "localformatt": [0, 4, 13, 14], "remoteformatt": [0, 4, 13, 14], "add_suffix": [0, 4, 13, 14], "load_formatt": [0, 4, 13, 14], "unify_format": [0, 4, 13, 14], "json_formatt": [0, 13, 14], "jsonformatt": [0, 4, 13, 14], "load": [0, 1, 3, 12, 13, 14], "mixture_formatt": [0, 13, 14], "mixtureformatt": [0, 4, 13, 14], "parquet_formatt": [0, 13, 14], "parquetformatt": [0, 4, 13, 14], "text_formatt": [0, 13, 14], "textformatt": [0, 4, 12, 13, 14], "extract_txt_from_docx": [0, 4, 13, 14], "extract_txt_from_pdf": [0, 4, 13, 14], "tsv_formatt": [0, 13, 14], "tsvformatt": [0, 4, 13, 14], "p": [0, 13, 14], "op": [0, 2, 3, 12, 13, 14], "common": [0, 3, 5, 13, 14], "helper_func": [0, 5, 13, 14], "unionfind": [0, 5, 6], "get_sentences_from_docu": [0, 5, 6], "get_words_from_docu": [0, 5, 6], "merge_on_whitespace_tab_newlin": [0, 5, 6], "split_on_newline_tab_whitespac": [0, 5, 6], "split_on_whitespac": [0, 5, 6], "strip": [0, 5, 6], "words_augment": [0, 5, 6], "words_refin": [0, 5, 6], 
"special_charact": [0, 5, 13, 14], "dedupl": [0, 3, 5, 13, 14], "document_dedupl": [0, 5, 13, 14], "documentdedupl": [0, 5, 7], "document_minhash_dedupl": [0, 5, 13, 14], "documentminhashdedupl": [0, 5, 7], "optimal_param": [0, 5, 7], "sha1_hash32": [0, 5, 7], "document_simhash_dedupl": [0, 5, 13, 14], "documentsimhashdedupl": [0, 5, 7], "local_num_differing_bit": [0, 5, 7], "num_differing_bits_selector": [0, 5, 7], "alphanumeric_filt": [0, 5, 13, 14], "alphanumericfilt": [0, 5, 8], "average_line_length_filt": [0, 5, 13, 14], "averagelinelengthfilt": [0, 5, 8], "character_repetition_filt": [0, 5, 13, 14], "characterrepetitionfilt": [0, 5, 8], "flagged_words_filt": [0, 5, 13, 14], "flaggedwordfilt": [0, 5, 8], "language_id_score_filt": [0, 5, 13, 14], "languageidscorefilt": [0, 5, 8], "maximum_line_length_filt": [0, 5, 13, 14], "maximumlinelengthfilt": [0, 5, 8], "perplexity_filt": [0, 5, 13, 14], "perplexityfilt": [0, 5, 8], "special_characters_filt": [0, 5, 13, 14], "specialcharactersfilt": [0, 5, 8], "specified_field_filt": [0, 5, 13, 14], "specifiedfieldfilt": [0, 5, 8], "specified_numeric_field_filt": [0, 5, 13, 14], "specifiednumericfieldfilt": [0, 5, 8], "is_numb": [0, 5, 8], "stopwords_filt": [0, 5, 13, 14], "stopwordsfilt": [0, 5, 8], "suffix_filt": [0, 5, 13, 14], "suffixfilt": [0, 5, 8], "text_length_filt": [0, 5, 13, 14], "textlengthfilt": [0, 5, 8], "token_num_filt": [0, 5, 13, 14], "tokennumfilt": [0, 5, 8], "word_num_filt": [0, 5, 13, 14], "wordnumfilt": [0, 5, 8], "word_repetition_filt": [0, 5, 13, 14], "wordrepetitionfilt": [0, 5, 8], "mapper": [0, 3, 5, 13, 14], "clean_copyright_mapp": [0, 5, 13, 14], "cleancopyrightmapp": [0, 5, 9], "clean_email_mapp": [0, 5, 13, 14], "cleanemailmapp": [0, 5, 9], "clean_html_mapp": [0, 5, 13, 14], "cleanhtmlmapp": [0, 5, 9], "clean_ip_mapp": [0, 5, 13, 14], "cleanipmapp": [0, 5, 9], "clean_links_mapp": [0, 5, 13, 14], "cleanlinksmapp": [0, 5, 9], "expand_macro_mapp": [0, 5, 13, 14], "expandmacromapp": [0, 5, 9], "fix_unicode_mapp": [0, 5, 13, 14], "fixunicodemapp": [0, 5, 9], "nlpaug_en_mapp": [0, 5, 13, 14], "nlpaugenmapp": [0, 5, 9], "nlpcda_zh_mapp": [0, 5, 13, 14], "nlpcdazhmapp": [0, 5, 9], "punctuation_normalization_mapp": [0, 5, 13, 14], "punctuationnormalizationmapp": [0, 5, 9], "remove_bibliography_mapp": [0, 5, 13, 14], "removebibliographymapp": [0, 5, 9], "remove_comments_mapp": [0, 5, 13, 14], "removecommentsmapp": [0, 5, 9], "remove_header_mapp": [0, 5, 13, 14], "removeheadermapp": [0, 5, 9], "remove_long_words_mapp": [0, 5, 13, 14], "removelongwordsmapp": [0, 5, 9], "remove_specific_chars_mapp": [0, 5, 13, 14], "removespecificcharsmapp": [0, 5, 9], "remove_table_text_mapp": [0, 5, 13, 14], "removetabletextmapp": [0, 5, 9], "remove_words_with_incorrect_substrings_mapp": [0, 5, 13, 14], "removewordswithincorrectsubstringsmapp": [0, 5, 9], "sentence_split_mapp": [0, 5, 13, 14], "sentencesplitmapp": [0, 5, 9], "whitespace_normalization_mapp": [0, 5, 13, 14], "whitespacenormalizationmapp": [0, 5, 9], "selector": [0, 5, 13, 14], "frequency_specified_field_selector": [0, 5, 13, 14], "frequencyspecifiedfieldselector": [0, 5, 10], "topk_specified_field_selector": [0, 5, 13, 14], "topkspecifiedfieldselector": [0, 5, 10], "to_numb": [0, 5, 10], "base_op": [0, 13, 14], "compute_hash": [0, 5, 7], "process": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12], "compute_stat": [0, 5, 8], "is_batched_op": [0, 5], "load_op": [0, 5, 13, 14], "op_fus": [0, 13, 14], "fusedfilt": [0, 5, 13, 14], "fuse_filter_group": [0, 5, 13, 14], "fuse_oper": [0, 5, 13, 14], 
"util": [0, 13, 14], "asset_util": [0, 13, 14], "load_words_asset": [0, 12, 13, 14], "cache_util": [0, 13, 14], "ckpt_util": [0, 13, 14], "checkpointmanag": [0, 12, 13, 14], "check_ckpt": [0, 12], "check_ops_to_skip": [0, 12], "get_left_process_list": [0, 12], "load_ckpt": [0, 12], "record": [0, 12], "save_ckpt": [0, 12], "compress": [0, 3, 13, 14], "basecompressor": [0, 12, 13, 14], "cachecompressmanag": [0, 12, 13, 14], "decompress": [0, 12, 13, 14], "format_cache_file_nam": [0, 12], "compressmanag": [0, 12, 13, 14], "compressionoff": [0, 12, 13, 14], "compressor": [0, 12, 13, 14], "extractor": [0, 12, 13, 14], "extract": [0, 3, 4, 12], "filelock": [0, 12, 13, 14], "gzipcompressor": [0, 12, 13, 14], "lz4compressor": [0, 12, 13, 14], "zstdcompressor": [0, 12, 13, 14], "cleanup_compressed_cache_fil": [0, 12, 13, 14], "constant": [0, 13, 14], "field": [0, 4, 5, 8, 10, 12, 13, 14], "context": [0, 5, 8, 12], "meta": [0, 3, 4, 12], "stat": [0, 1, 3, 5, 8, 12], "hashkei": [0, 12, 13, 14], "hash": [0, 5, 7, 12], "minhash": [0, 7, 12], "simhash": [0, 7, 12], "intervar": [0, 12, 13, 14], "line": [0, 1, 2, 8, 12], "refined_word": [0, 12], "word": [0, 6, 8, 9, 12], "statskei": [0, 12, 13, 14], "alnum_ratio": [0, 12], "alpha_token_ratio": [0, 12], "avg_line_length": [0, 12], "char_rep_ratio": [0, 12], "flagged_words_ratio": [0, 12], "lang": [0, 8, 9, 12], "lang_scor": [0, 12], "max_line_length": [0, 12], "num_token": [0, 12], "num_word": [0, 12], "perplex": [0, 8, 12], "special_char_ratio": [0, 12], "stopwords_ratio": [0, 12], "text_len": [0, 12], "word_rep_ratio": [0, 12], "file_util": [0, 13, 14], "find_files_with_suffix": [0, 12, 13, 14], "is_absolute_path": [0, 12, 13, 14], "fingerprint_util": [0, 13, 14], "hasher": [0, 12, 13, 14], "dispatch": [0, 12], "hash_byt": [0, 12], "hash_default": [0, 12], "hexdigest": [0, 12], "updat": [0, 2, 12], "generate_fingerprint": [0, 12, 13, 14], "update_fingerprint": [0, 12, 13, 14], "logger_util": [0, 13, 14], "hiddenprint": [0, 12, 13, 14], "streamtologuru": [0, 12, 13, 14], "flush": [0, 12], "write": [0, 12], "get_caller_nam": [0, 12, 13, 14], "get_log_file_path": [0, 12, 13, 14], "redirect_sys_output": [0, 12, 13, 14], "setup_logg": [0, 12, 13, 14], "model_util": [0, 13, 14], "check_model": [0, 12, 13, 14], "get_model": [0, 12, 13, 14], "prepare_diversity_model": [0, 12, 13, 14], "prepare_fasttext_model": [0, 12, 13, 14], "prepare_huggingface_token": [0, 12, 13, 14], "prepare_kenlm_model": [0, 12, 13, 14], "prepare_model": [0, 12, 13, 14], "prepare_nltk_model": [0, 12, 13, 14], "prepare_sentencepiece_model": [0, 12, 13, 14], "registri": [0, 13, 14], "get": [0, 1, 4, 6, 12], "list": [0, 1, 2, 3, 4, 5, 6, 8, 9, 12], "modul": [0, 2, 4, 12, 13], "name": [0, 1, 2, 3, 4, 5, 8, 12], "register_modul": [0, 12], "class": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12], "dataset": [1, 3, 4, 5, 7, 10, 12], "output_path": [1, 12], "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 9, 10, 12], "save_stats_in_one_fil": 1, "true": [1, 3, 5, 6, 7, 8, 9, 10, 12], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12], "base": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12], "object": [1, 2, 3, 4, 5, 6, 12], "appli": [1, 3, 10], "each": [1, 2, 3, 5, 7, 9], "column": [1, 3, 9], "respect": [1, 7], "initi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12], "method": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12], "param": [1, 2, 6, 7, 12], "path": [1, 3, 4, 12], "store": [1, 3, 4, 5, 7, 8, 12], "result": [1, 3], "option": [1, 3, 4, 12], "precomput": 1, "overal": 1, "whether": [1, 3, 4, 5, 6, 7, 8, 9, 12], "save": [1, 4, 12], "all": [1, 3, 6, 8, 
9, 12], "figur": [1, 3], "one": [1, 2, 6, 7, 8, 12], "imag": 1, "file": [1, 2, 3, 4, 5, 8, 12], "show_percentil": 1, "fals": [1, 4, 5, 6, 7, 8, 9, 12], "show": [1, 3], "draw": 1, "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12], "percentil": 1, "sub": [1, 2, 6, 7, 12], "If": [1, 8, 9, 12], "sever": 1, "red": 1, "indic": 1, "quantil": 1, "distribut": [1, 3], "singl": [1, 3, 5], "window": [1, 7], "after": [1, 2, 3, 4, 6, 7], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12], "ax": 1, "save_path": 1, "box": 1, "plot": 1, "includ": [1, 5, 7, 8, 9], "inform": [1, 5, 8, 10], "histogram": 1, "total_num": 1, "factor": 1, "2": [1, 3, 6, 8, 9], "given": [1, 3], "total": [1, 8, 9], "number": [1, 3, 4, 5, 7, 8, 9, 10, 12], "best": 1, "row": [1, 7], "thi": [1, 3, 5, 6, 7, 8, 9, 10, 12], "function": [1, 3, 6, 7], "need": [1, 6, 8, 9, 10, 12], "when": [1, 3, 4, 5, 7, 10, 12], "we": [1, 7, 8, 9, 12], "type": [1, 2, 4, 7, 9, 12], "In": [1, 3], "default": [1, 2, 3, 4, 7, 9, 12], "which": [1, 3, 5, 8, 9, 12], "mean": [1, 3, 12], "ar": [1, 3, 6, 7, 10, 12], "grid": 1, "lang_or_model": 1, "en": [1, 8, 9, 12], "divers": [1, 12], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "an": [1, 3, 4, 5, 7, 12], "model": [1, 6, 8, 9, 12], "specif": [1, 3, 5, 7, 8, 9, 12], "languag": [1, 7, 8, 9, 12], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 12], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "postproc_func": 1, "postproc_kwarg": 1, "whole": 1, "argument": [1, 3, 5], "lexic": 1, "tree": 1, "tree_root": 1, "find": [1, 3, 5, 6, 12], "verb": 1, "its": [1, 3, 4, 5, 9, 12], "closest": 1, "root": 1, "valid": [1, 12], "nlp": 1, "first_sent": 1, "input": [1, 3, 4, 5, 7, 8, 10, 12], "string": [1, 7, 9, 12], "first": [1, 3, 6, 7], "sentenc": [1, 6, 9], "onli": [1, 3, 7, 9, 12], "matter": 1, "over": 1, "top_k_verb": 1, "20": [1, 9], "top_k_noun": 1, "4": [1, 7], "kwarg": [1, 3, 4, 7, 8, 9, 10, 12], "keep": [1, 5, 7, 8, 12], "largest": 1, "group": [1, 5, 6, 8], "noun": 1, "extra": [1, 2, 3, 4, 7, 8, 9, 10, 12], "arg": [1, 2, 3, 4, 5, 7, 8, 9, 10, 12], "std": 1, "etc": [1, 3, 4, 12], "describ": 1, "panda": 1, "cfg": [2, 3, 4], "jsonargpars": 2, "parser": 2, "pars": 2, "from": [2, 3, 4, 5, 6, 7, 8, 9, 12], "posix": 2, "style": 2, "command": [2, 4], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 12], "variabl": [2, 5], "hard": 2, "code": [2, 9], "conifg": 2, "defaut": 2, "global": [2, 4], "do": 2, "some": [2, 7, 9, 12], "setup": [2, 12], "task": 2, "creat": 2, "work": [2, 3, 9], "directori": [2, 3, 4, 8, 12], "log": [2, 12], "cach": [2, 3, 8, 12], "checkpoint": [2, 3, 12], "temp_dir": 2, "tempfil": 2, "origin": [2, 4, 5, 7, 9, 12], "op_name_class": 2, "split": [2, 6, 9], "item": [2, 3, 5], "sort": [2, 10], "them": [2, 4, 7, 12], "concat": 2, "togeth": 2, "pair": [2, 3, 5, 7], "op_nam": [2, 3, 12], "op_class": 2, "It": [3, 7], "multipl": [3, 4, 6, 12], "gener": [3, 9, 12], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": 3, "dict": [3, 12], "load_data_np": 3, "pipelin": 3, "worker": [3, 12], "karg": 3, "enhanc": 3, "huggingfac": [3, 4, 12], "usabl": 3, "effici": 3, "overrid": [3, 12], "add": [3, 4], "func": 3, "can": [3, 9], "access": 3, "nest": 3, "manner": 3, "clear": 3, "raw": 3, "call": [3, 7], "most": 3, "oper": [3, 5, 12], "classmethod": [3, 12], "from_xx": 3, "constructor": 3, "construct": 3, "remov": [3, 5, 6, 9, 12], "datasetdict": [3, 4], "obj": 3, "wrap": 3, "root_obj": 3, "kei": [3, 4, 5, 8, 10, 12], "check": [3, 4, 12], "flatten": 3, "layer": 3, "queri": 3, "date": 3, "befor": [3, 12], "conduct": [3, 4, 
5], "actual": 3, "ones": 3, "unifi": [3, 4], "order": [3, 10], "export_path": 3, "export_shard_s": 3, "0": [3, 4, 5, 7, 8, 12], "export_in_parallel": 3, "num_proc": [3, 4, 12], "1": [3, 4, 9, 12], "export_d": 3, "export_stat": 3, "The": [3, 4, 5, 7, 8, 9, 10], "1073741824": 3, "1024": 3, "1048576": 3, "1099511627776": 3, "size": [3, 6, 7, 8], "shard": 3, "content": [3, 12], "static": [3, 12], "jsonl": [3, 4], "target": [3, 4, 8, 10, 12], "parquet": [3, 4], "rai": 3, "experiment": 3, "juicer": 3, "cluster": 3, "support": [3, 9, 12], "now": [3, 6, 9], "advanc": 3, "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8], "trace": [3, 5, 7], "chang": [3, 7, 9, 12], "comparison": 3, "maximum": [3, 8], "str": [3, 4, 5, 6, 7, 8, 9, 10, 12], "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": [3, 12], "batchmapp": 3, "mainli": 3, "new": [3, 4, 12], "augment": [3, 6, 8, 9], "dup_pair": 3, "duplic": [3, 5, 7], "differ": [3, 4, 6, 7, 9], "other": 3, "two": [3, 7], "embed": 3, "independ": [3, 9], "obtain": [3, 6], "due": [3, 7], "modif": [3, 4], "dataset_path": 4, "csv": 4, "specifi": [4, 6, 8, 10, 12], "tupl": [4, 7, 8, 12], "local": [4, 7], "packag": [4, 7], "info": [4, 5, 12], "int": [4, 7, 9, 12], "global_cfg": 4, "consequ": 4, "repositori": 4, "hub": 4, "featur": 4, "appropri": 4, "read": 4, "intern": 4, "follow": 4, "out": 4, "those": 4, "empti": 4, "sinc": [4, 6], "mai": 4, "modifi": 4, "unified_format_dataset": 4, "zst": 4, "mixtur": 4, "weight": [4, 7], "accord": [4, 5, 7], "mix": 4, "randomli": [4, 9], "everi": 4, "merg": [4, 6, 8], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "txt": [4, 8, 12], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": [4, 12], "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "h": 4, "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "smali": 4, "datas": 4, "fn": 4, "tgt_path": 4, "tsv": 4, "delimit": 4, "union": [5, 6], "should_keep_long_word": [5, 9], "should_keep_word_with_incorrect_substr": [5, 9], "valu": [5, 7, 8, 10, 12], "For": [5, 7, 8, 9], "doc": [5, 7], "level": [5, 6, 7, 8, 9, 10, 12], "open": [5, 7, 9], "metric": [5, 8], "decid": [5, 8, 12], "intermedi": [5, 8], "var": [5, 8], "temporarili": [5, 8, 12], "boolean": [5, 8], "edit": 5, "process_list": 5, "A": 5, "fuse": 5, "share": 5, "same": [5, 12], "instanc": [5, 12], "fused_filt": 5, "fused_fil": 5, "original_filter_group": 5, "definit": [5, 9], "correspond": [5, 8, 10, 12], "x": 6, "document": [6, 7, 8, 9], "model_func": 6, "splite": 6, "separ": [6, 8, 10], "token_func": 6, "new_lin": 6, "tab": 6, "ratio": [6, 8, 10], "like": [6, 7, 9, 12], "stopword": [6, 8], "token": [6, 7, 8, 9, 12], "invert": 6, "concaten": 6, "setenc": 6, "also": 6, "space": [6, 7], "tag": 6, "strip_charact": 6, "wai": 6, "faster": 6, "than": [6, 7, 8], "set": [6, 10], "instead": 6, "contain": [6, 12], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "uesd": 6, "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 12], "vietnames": [6, 8], "syllabl": 6, "ad": 6, "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], 
"words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7], "lower": [6, 7], "case": [6, 7], "lowercas": [6, 7], "char": [6, 8, 9], "bool": [7, 8, 9, 10, 12], "ignore_non_charact": 7, "exact": 7, "match": 7, "md5": 7, "ignor": 7, "alphabet": [7, 8], "whitespac": [7, 9], "digit": 7, "punctuat": [7, 9], "window_s": 7, "positiveint": [7, 8, 9, 10], "5": [7, 8], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 10], "7": 7, "num_band": 7, "num_rows_per_band": 7, "minhashlsh": 7, "byte": [7, 12], "so": [7, 9, 12], "thei": 7, "won": 7, "kept": 7, "final": 7, "should": [7, 12], "english": [7, 9], "recommend": [7, 9], "And": 7, "shingl": 7, "pattern": 7, "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 9], "threshold": 7, "detect": 7, "regard": 7, "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "algorithm": [7, 12], "minim": 7, "sum": 7, "prob": 7, "posit": 7, "neg": 7, "float": [7, 8], "num_perm": 7, "false_positive_weight": 7, "false_negative_weight": 7, "probabl": 7, "taken": 7, "datasketch": 7, "b": 7, "per": 7, "directli": 7, "avoid": [7, 12], "depend": 7, "6": [7, 8], "num_block": 7, "hamming_dist": 7, "block": 7, "max": [7, 8, 9], "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8], "hash_a": 7, "hash_b": 7, "implement": 7, "calcul": 7, "bit": [7, 12], "integ": 7, "num_differing_bit": 7, "python": [7, 12], "version": [7, 12], "instal": 7, "3": [7, 8, 9], "9": 7, "librari": [7, 9], "cannot": 7, "compil": 7, "correctli": 7, "cython": 7, "fix": [7, 9], "incompat": 7, "recursionerror": 7, "occur": 7, "sometim": [7, 12], "our": 7, "otherwis": [7, 9, 12], "avail": [7, 8, 12], "min_ratio": 8, "25": 8, "max_ratio": 8, "positivefloat": 8, "9223372036854775807": [8, 9], "numer": 8, "within": [8, 9, 10, 12], "rang": [8, 9, 12], "count": 8, "alphanumer": 8, "below": [8, 9], "exce": [8, 9], "min_len": [8, 9], "max_len": [8, 9], "averag": 8, "length": [8, 9], "rep_len": 8, "gram": 8, "repetit": 8, "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": [8, 12], "flag": 8, "consid": 8, "what": 8, "adopt": 8, "flagged_word": 8, "whose": [8, 12], "join": 8, "min_scor": 8, "8": 8, "confid": 8, "score": 8, "larger": 8, "identif": 8, "max_ppl": 8, "1500": 8, "field_kei": [8, 10], "target_valu": 8, "multi": [8, 10, 12], "retain": 8, "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwords_dir": 8, "exampl": [8, 12], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": 8, "face": 8, "clean": [9, 12], "copyright": 9, "comment": 9, "begin": [9, 12], "email": 9, "ipv4": 9, "ipv6": 9, "address": 9, "link": [9, 12], "http": 9, "ftp": 9, "expand": 9, "macro": 9, "bodi": 9, "latex": 9, "unicod": 9, "error": 9, "sequenti": 9, "aug_num": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "you": 9, "time": 9, "semant": 9, "might": [9, 12], "significantli": 9, "combin": [9, 12], "sequenc": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "random": 9, "love": 9, "llm": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ai": 9, "ll": 9, "keyboard": 9, "ov4": 9, "ocr": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, 
"nlpcda": 9, "notic": 9, "replac": [9, 12], "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "normal": 9, "bibliographi": 9, "end": [9, 12], "doc_typ": 9, "inlin": 9, "multilin": 9, "kind": 9, "drop_no_head": 9, "header": 9, "drop": 9, "long": 9, "chars_to_remov": 9, "min_col": 9, "from_2_to_20": 9, "max_col": 9, "regular": 9, "express": 9, "substr": 9, "incorrect": 9, "0x20": 9, "found": [9, 12], "here": [9, 12], "wikipedia": 9, "org": 9, "wiki": 9, "whitespace_charact": 9, "top_ratio": 10, "topk": 10, "frequenc": 10, "top": 10, "both": 10, "smaller": 10, "rule": 10, "descend": 10, "words_dir": 12, "words_typ": 12, "download": 12, "asset_link": 12, "team": 12, "ckpt_dir": 12, "original_process_list": 12, "latest": 12, "manag": 12, "rerun": 12, "reload": 12, "skip": 12, "ani": 12, "els": 12, "prefix": 12, "part": 12, "start": 12, "scratch": 12, "somm": 12, "left": 12, "unchang": 12, "op_arg": 12, "dump": 12, "abc": 12, "abstract": 12, "input_path": 12, "output": 12, "uncompress": 12, "compressor_format": 12, "zstd": 12, "up": 12, "prev_d": 12, "this_d": 12, "fingerprint": 12, "previou": 12, "current": 12, "There": 12, "overlap": 12, "must": 12, "again": 12, "fingerprintd": 12, "accept": 12, "cache_file_nam": 12, "rank": 12, "defin": 12, "turn": 12, "off": 12, "see": 12, "gzip": 12, "lz4": 12, "extractor_format": 12, "lock_fil": 12, "timeout": 12, "max_filename_length": 12, "unixfilelock": 12, "lock": 12, "compresss": 12, "automat": 12, "__dj__context__": 12, "__dj__meta__": 12, "__dj__stats__": 12, "__dj__suffix__": 12, "__dj__hash": 12, "__dj__minhash": 12, "__dj__simhash": 12, "__dj__line": 12, "__dj__refined_word": 12, "__dj__word": 12, "travers": 12, "absolut": 12, "rel": 12, "objet": 12, "dill": 12, "serial": 12, "failur": 12, "variou": 12, "transform": 12, "transform_arg": 12, "hide": 12, "caller_nam": 12, "stream": 12, "redirect": 12, "logger": 12, "loguru": 12, "caller": 12, "apex": 12, "pycocotool": 12, "buf": 12, "depth": 12, "locat": 12, "log_level": 12, "stdout": 12, "stderr": 12, "save_dir": 12, "distributed_rank": 12, "filenam": 12, "mode": 12, "train": 12, "test": 12, "devic": 12, "gpu": 12, "append": 12, "system": 12, "model_nam": 12, "forc": 12, "exist": 12, "model_path": 12, "full": 12, "forcefulli": 12, "mayb": 12, "incomplet": 12, "reason": 12, "model_kei": 12, "model_typ": 12, "sentencepiec": 12, "model_zoo": 12, "tokenzi": 12, "prepar": 12, "zh": 12, "fasttext": 12, "tokenizer_nam": 12, "kenlm": 12, "syntax": 12, "render": 12, "nltk": 12, "punkt": 12, "regist": 12, "repo": 12, "module_kei": 12, "properti": 12, "module_nam": 12, "module_cl": 12, "modulenam": 12, "pass": 12, "textformatter2": 12, "text_formatter2": 12, "d": [13, 14], "t": [13, 14], "_": [13, 14], "j": [13, 14], "u": [13, 14], "i": [13, 14], "c": [13, 14], "e": [13, 14], "r": [13, 14], "index": 13, "search": 13, "page": 13}, "objects": {"data_juicer.analysis": [[1, 0, 0, "-", "column_wise_analysis"], [1, 0, 0, "-", "diversity_analysis"], 
[1, 0, 0, "-", "overall_analysis"]], "data_juicer.analysis.column_wise_analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 3, 1, "", "get_row_col"]], "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyse"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.diversity_analysis": [[1, 1, 1, "", "DiversityAnalysis"], [1, 3, 1, "", "find_root_verb_and_its_dobj"], [1, 3, 1, "", "find_root_verb_and_its_dobj_in_string"], [1, 3, 1, "", "get_diversity"]], "data_juicer.analysis.diversity_analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyse"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.overall_analysis": [[1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.overall_analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyse"]], "data_juicer.config": [[2, 0, 0, "-", "config"]], "data_juicer.config.config": [[2, 3, 1, "", "config_backup"], [2, 3, 1, "", "display_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "init_setup_from_cfg"], [2, 3, 1, "", "sort_op_by_types_and_names"]], "data_juicer.core": [[3, 0, 0, "-", "analyser"], [3, 0, 0, "-", "data"], [3, 0, 0, "-", "executor"], [3, 0, 0, "-", "exporter"], [3, 0, 0, "-", "ray_executor"], [3, 0, 0, "-", "tracer"]], "data_juicer.core.analyser": [[3, 1, 1, "", "Analyser"]], "data_juicer.core.analyser.Analyser": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.data": [[3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "NestedDatasetDict"], [3, 1, 1, "", "NestedQueryDict"], [3, 3, 1, "", "nested_obj_factory"], [3, 3, 1, "", "nested_query"], [3, 3, 1, "", "wrap_func_with_nested_access"]], "data_juicer.core.data.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.data.NestedDatasetDict": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "map"]], "data_juicer.core.data.NestedQueryDict": [[3, 2, 1, "", "__init__"]], "data_juicer.core.executor": [[3, 1, 1, "", "Executor"]], "data_juicer.core.executor.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.exporter": [[3, 1, 1, "", "Exporter"]], "data_juicer.core.exporter.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.ray_executor": [[3, 1, 1, "", "RayExecutor"]], "data_juicer.core.ray_executor.RayExecutor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.tracer": [[3, 1, 1, "", "Tracer"]], "data_juicer.core.tracer.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 0, 0, "-", "csv_formatter"], [4, 0, 0, "-", "formatter"], [4, 0, 0, "-", "json_formatter"], [4, 0, 0, "-", "load"], [4, 0, 0, "-", "mixture_formatter"], [4, 0, 0, "-", "parquet_formatter"], [4, 0, 0, "-", "text_formatter"], [4, 0, 0, "-", "tsv_formatter"]], "data_juicer.format.csv_formatter": [[4, 1, 1, "", "CsvFormatter"]], "data_juicer.format.csv_formatter.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.formatter": [[4, 1, 1, "", "BaseFormatter"], 
[4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 3, 1, "", "add_suffixes"], [4, 3, 1, "", "load_formatter"], [4, 3, 1, "", "unify_format"]], "data_juicer.format.formatter.BaseFormatter": [[4, 2, 1, "", "load_dataset"]], "data_juicer.format.formatter.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.formatter.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.json_formatter": [[4, 1, 1, "", "JsonFormatter"]], "data_juicer.format.json_formatter.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.load": [[4, 3, 1, "", "load_formatter"]], "data_juicer.format.mixture_formatter": [[4, 1, 1, "", "MixtureFormatter"]], "data_juicer.format.mixture_formatter.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.parquet_formatter": [[4, 1, 1, "", "ParquetFormatter"]], "data_juicer.format.parquet_formatter.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.text_formatter": [[4, 1, 1, "", "TextFormatter"], [4, 3, 1, "", "extract_txt_from_docx"], [4, 3, 1, "", "extract_txt_from_pdf"]], "data_juicer.format.text_formatter.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.tsv_formatter": [[4, 1, 1, "", "TsvFormatter"]], "data_juicer.format.tsv_formatter.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 0, 0, "-", "base_op"], [5, 0, 0, "-", "load"], [5, 0, 0, "-", "op_fusion"]], "data_juicer.ops.base_op": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"]], "data_juicer.ops.base_op.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "is_batched_op"], [5, 2, 1, "", "process"]], "data_juicer.ops.base_op.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"]], "data_juicer.ops.common": [[6, 0, 0, "-", "helper_func"], [6, 0, 0, "-", "special_characters"]], "data_juicer.ops.common.helper_func": [[6, 1, 1, "", "UnionFind"], [6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.common.helper_func.UnionFind": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "find"], [6, 2, 1, "", "union"]], "data_juicer.ops.deduplicator": [[7, 0, 0, "-", "document_deduplicator"], [7, 0, 0, "-", "document_minhash_deduplicator"], [7, 0, 0, "-", "document_simhash_deduplicator"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"]], "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 3, 1, "", "optimal_param"], [7, 3, 1, "", "sha1_hash32"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, 
"", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 3, 1, "", "local_num_differing_bits"], [7, 3, 1, "", "num_differing_bits_selector"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 0, 0, "-", "alphanumeric_filter"], [8, 0, 0, "-", "average_line_length_filter"], [8, 0, 0, "-", "character_repetition_filter"], [8, 0, 0, "-", "flagged_words_filter"], [8, 0, 0, "-", "language_id_score_filter"], [8, 0, 0, "-", "maximum_line_length_filter"], [8, 0, 0, "-", "perplexity_filter"], [8, 0, 0, "-", "special_characters_filter"], [8, 0, 0, "-", "specified_field_filter"], [8, 0, 0, "-", "specified_numeric_field_filter"], [8, 0, 0, "-", "stopwords_filter"], [8, 0, 0, "-", "suffix_filter"], [8, 0, 0, "-", "text_length_filter"], [8, 0, 0, "-", "token_num_filter"], [8, 0, 0, "-", "word_num_filter"], [8, 0, 0, "-", "word_repetition_filter"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, 1, 1, "", "AlphanumericFilter"]], "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.average_line_length_filter": [[8, 1, 1, "", "AverageLineLengthFilter"]], "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.character_repetition_filter": [[8, 1, 1, "", "CharacterRepetitionFilter"]], "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.flagged_words_filter": [[8, 1, 1, "", "FlaggedWordFilter"]], "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.language_id_score_filter": [[8, 1, 1, "", "LanguageIDScoreFilter"]], "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, 1, 1, "", "MaximumLineLengthFilter"]], "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.perplexity_filter": [[8, 1, 1, "", "PerplexityFilter"]], "data_juicer.ops.filter.perplexity_filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.special_characters_filter": [[8, 1, 1, "", "SpecialCharactersFilter"]], "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.specified_field_filter": [[8, 1, 1, "", "SpecifiedFieldFilter"]], "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 3, 1, "", "is_number"]], 
"data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.stopwords_filter": [[8, 1, 1, "", "StopWordsFilter"]], "data_juicer.ops.filter.stopwords_filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.suffix_filter": [[8, 1, 1, "", "SuffixFilter"]], "data_juicer.ops.filter.suffix_filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.text_length_filter": [[8, 1, 1, "", "TextLengthFilter"]], "data_juicer.ops.filter.text_length_filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.token_num_filter": [[8, 1, 1, "", "TokenNumFilter"]], "data_juicer.ops.filter.token_num_filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.word_num_filter": [[8, 1, 1, "", "WordNumFilter"]], "data_juicer.ops.filter.word_num_filter.WordNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.word_repetition_filter": [[8, 1, 1, "", "WordRepetitionFilter"]], "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.load": [[5, 3, 1, "", "load_ops"]], "data_juicer.ops.mapper": [[9, 0, 0, "-", "clean_copyright_mapper"], [9, 0, 0, "-", "clean_email_mapper"], [9, 0, 0, "-", "clean_html_mapper"], [9, 0, 0, "-", "clean_ip_mapper"], [9, 0, 0, "-", "clean_links_mapper"], [9, 0, 0, "-", "expand_macro_mapper"], [9, 0, 0, "-", "fix_unicode_mapper"], [9, 0, 0, "-", "nlpaug_en_mapper"], [9, 0, 0, "-", "nlpcda_zh_mapper"], [9, 0, 0, "-", "punctuation_normalization_mapper"], [9, 0, 0, "-", "remove_bibliography_mapper"], [9, 0, 0, "-", "remove_comments_mapper"], [9, 0, 0, "-", "remove_header_mapper"], [9, 0, 0, "-", "remove_long_words_mapper"], [9, 0, 0, "-", "remove_specific_chars_mapper"], [9, 0, 0, "-", "remove_table_text_mapper"], [9, 0, 0, "-", "remove_words_with_incorrect_substrings_mapper"], [9, 0, 0, "-", "sentence_split_mapper"], [9, 0, 0, "-", "whitespace_normalization_mapper"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, 1, 1, "", "CleanCopyrightMapper"]], "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, 1, 1, "", "CleanEmailMapper"]], "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, 1, 1, "", "CleanHtmlMapper"]], "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, 1, 1, "", "CleanIpMapper"]], "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, 1, 1, "", "CleanLinksMapper"]], "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, 1, 1, "", "ExpandMacroMapper"]], 
"data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, 1, 1, "", "FixUnicodeMapper"]], "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, 1, 1, "", "NlpaugEnMapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, 1, 1, "", "NlpcdaZhMapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, 1, 1, "", "PunctuationNormalizationMapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, 1, 1, "", "RemoveBibliographyMapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, 1, 1, "", "RemoveCommentsMapper"]], "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, 1, 1, "", "RemoveHeaderMapper"]], "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, 1, 1, "", "RemoveLongWordsMapper"]], "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, 1, 1, "", "RemoveSpecificCharsMapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, 1, 1, "", "RemoveTableTextMapper"]], "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, 1, 1, "", "SentenceSplitMapper"]], "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.op_fusion": [[5, 1, 1, "", "FusedFilter"], [5, 3, 1, "", "fuse_filter_group"], [5, 3, 1, "", "fuse_operators"]], "data_juicer.ops.op_fusion.FusedFilter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 0, 0, "-", "frequency_specified_field_selector"], [10, 0, 0, "-", "topk_specified_field_selector"]], 
"data_juicer.ops.selector.frequency_specified_field_selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"]], "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, 1, 1, "", "TopkSpecifiedFieldSelector"], [10, 3, 1, "", "to_number"]], "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.utils": [[12, 0, 0, "-", "asset_utils"], [12, 0, 0, "-", "cache_utils"], [12, 0, 0, "-", "ckpt_utils"], [12, 0, 0, "-", "compress"], [12, 0, 0, "-", "constant"], [12, 0, 0, "-", "file_utils"], [12, 0, 0, "-", "fingerprint_utils"], [12, 0, 0, "-", "logger_utils"], [12, 0, 0, "-", "model_utils"], [12, 0, 0, "-", "registry"]], "data_juicer.utils.asset_utils": [[12, 3, 1, "", "load_words_asset"]], "data_juicer.utils.ckpt_utils": [[12, 1, 1, "", "CheckpointManager"]], "data_juicer.utils.ckpt_utils.CheckpointManager": [[12, 2, 1, "", "__init__"], [12, 2, 1, "", "check_ckpt"], [12, 2, 1, "", "check_ops_to_skip"], [12, 2, 1, "", "get_left_process_list"], [12, 2, 1, "", "load_ckpt"], [12, 2, 1, "", "record"], [12, 2, 1, "", "save_ckpt"]], "data_juicer.utils.compress": [[12, 1, 1, "", "BaseCompressor"], [12, 1, 1, "", "CacheCompressManager"], [12, 1, 1, "", "CompressManager"], [12, 1, 1, "", "CompressionOff"], [12, 1, 1, "", "Compressor"], [12, 1, 1, "", "Extractor"], [12, 1, 1, "", "FileLock"], [12, 1, 1, "", "GzipCompressor"], [12, 1, 1, "", "Lz4Compressor"], [12, 1, 1, "", "ZstdCompressor"], [12, 3, 1, "", "cleanup_compressed_cache_files"], [12, 3, 1, "", "compress"], [12, 3, 1, "", "decompress"]], "data_juicer.utils.compress.BaseCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.CacheCompressManager": [[12, 2, 1, "", "__init__"], [12, 2, 1, "", "cleanup_cache_files"], [12, 2, 1, "", "compress"], [12, 2, 1, "", "decompress"], [12, 2, 1, "", "format_cache_file_name"]], "data_juicer.utils.compress.CompressManager": [[12, 2, 1, "", "__init__"], [12, 2, 1, "", "compress"], [12, 2, 1, "", "decompress"]], "data_juicer.utils.compress.Compressor": [[12, 2, 1, "", "compress"], [12, 4, 1, "", "compressors"]], "data_juicer.utils.compress.Extractor": [[12, 2, 1, "", "extract"]], "data_juicer.utils.compress.GzipCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.Lz4Compressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.compress.ZstdCompressor": [[12, 2, 1, "", "compress"]], "data_juicer.utils.constant": [[12, 1, 1, "", "Fields"], [12, 1, 1, "", "HashKeys"], [12, 1, 1, "", "InterVars"], [12, 1, 1, "", "StatsKeys"]], "data_juicer.utils.constant.Fields": [[12, 4, 1, "", "context"], [12, 4, 1, "", "meta"], [12, 4, 1, "", "stats"], [12, 4, 1, "", "suffix"]], "data_juicer.utils.constant.HashKeys": [[12, 4, 1, "", "hash"], [12, 4, 1, "", "minhash"], [12, 4, 1, "", "simhash"]], "data_juicer.utils.constant.InterVars": [[12, 4, 1, "", "lines"], [12, 4, 1, "", "refined_words"], [12, 4, 1, "", "words"]], "data_juicer.utils.constant.StatsKeys": [[12, 4, 1, "", "alnum_ratio"], [12, 4, 1, "", "alpha_token_ratio"], [12, 4, 1, "", "avg_line_length"], [12, 4, 1, "", "char_rep_ratio"], [12, 4, 1, "", "flagged_words_ratio"], [12, 4, 1, "", "lang"], [12, 4, 1, "", "lang_score"], [12, 4, 1, "", "max_line_length"], [12, 4, 1, "", "num_token"], [12, 4, 1, "", "num_words"], [12, 4, 1, "", "perplexity"], [12, 4, 1, "", "special_char_ratio"], 
[12, 4, 1, "", "stopwords_ratio"], [12, 4, 1, "", "text_len"], [12, 4, 1, "", "word_rep_ratio"]], "data_juicer.utils.file_utils": [[12, 3, 1, "", "find_files_with_suffix"], [12, 3, 1, "", "is_absolute_path"]], "data_juicer.utils.fingerprint_utils": [[12, 1, 1, "", "Hasher"], [12, 3, 1, "", "generate_fingerprint"], [12, 3, 1, "", "update_fingerprint"]], "data_juicer.utils.fingerprint_utils.Hasher": [[12, 2, 1, "", "__init__"], [12, 4, 1, "", "dispatch"], [12, 2, 1, "", "hash"], [12, 2, 1, "", "hash_bytes"], [12, 2, 1, "", "hash_default"], [12, 2, 1, "", "hexdigest"], [12, 2, 1, "", "update"]], "data_juicer.utils.logger_utils": [[12, 1, 1, "", "HiddenPrints"], [12, 1, 1, "", "StreamToLoguru"], [12, 3, 1, "", "get_caller_name"], [12, 3, 1, "", "get_log_file_path"], [12, 3, 1, "", "redirect_sys_output"], [12, 3, 1, "", "setup_logger"]], "data_juicer.utils.logger_utils.StreamToLoguru": [[12, 2, 1, "", "__init__"], [12, 2, 1, "", "flush"], [12, 2, 1, "", "write"]], "data_juicer.utils.model_utils": [[12, 3, 1, "", "check_model"], [12, 3, 1, "", "get_model"], [12, 3, 1, "", "prepare_diversity_model"], [12, 3, 1, "", "prepare_fasttext_model"], [12, 3, 1, "", "prepare_huggingface_tokenizer"], [12, 3, 1, "", "prepare_kenlm_model"], [12, 3, 1, "", "prepare_model"], [12, 3, 1, "", "prepare_nltk_model"], [12, 3, 1, "", "prepare_sentencepiece_model"]], "data_juicer.utils.registry": [[12, 1, 1, "", "Registry"]], "data_juicer.utils.registry.Registry": [[12, 2, 1, "", "__init__"], [12, 2, 1, "", "get"], [12, 2, 1, "", "list"], [12, 5, 1, "", "modules"], [12, 5, 1, "", "name"], [12, 2, 1, "", "register_module"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"d": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "t": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "_": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "j": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "u": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "c": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "e": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "r": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "n": [1, 2, 6], "l": [1, 7, 8, 10, 11, 12], "y": 1, "": [1, 5, 6, 7, 8, 9, 10, 11, 12, 13], "data_juic": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14], "analysi": 1, "column_wise_analysi": 1, "diversity_analysi": 1, "overall_analysi": 1, "o": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "f": [2, 4, 8], "g": 2, "config": 2, "core": 3, "analys": 3, "data": [3, 13], "executor": 3, "export": 3, "ray_executor": 3, "tracer": 3, "m": [4, 6, 9], "format": 4, "csv_formatt": 4, "formatt": 4, "json_formatt": 4, "load": [4, 5], "mixture_formatt": 4, "parquet_formatt": 4, "text_formatt": 4, "tsv_formatt": 4, "p": [5, 6, 7, 8, 9, 10], "op": [5, 6, 7, 8, 9, 10], "base_op": 5, "op_fus": 5, "common": 6, "helper_func": 6, "special_charact": 6, "dedupl": 7, "document_dedupl": 7, "document_minhash_dedupl": 7, "document_simhash_dedupl": 7, "filter": 8, "alphanumeric_filt": 8, "average_line_length_filt": 8, "character_repetition_filt": 8, "flagged_words_filt": 8, "language_id_score_filt": 8, "maximum_line_length_filt": 8, "perplexity_filt": 8, "special_characters_filt": 8, "specified_field_filt": 8, 
"specified_numeric_field_filt": 8, "stopwords_filt": 8, "suffix_filt": 8, "text_length_filt": 8, "token_num_filt": 8, "word_num_filt": 8, "word_repetition_filt": 8, "mapper": 9, "clean_copyright_mapp": 9, "clean_email_mapp": 9, "clean_html_mapp": 9, "clean_ip_mapp": 9, "clean_links_mapp": 9, "expand_macro_mapp": 9, "fix_unicode_mapp": 9, "nlpaug_en_mapp": 9, "nlpcda_zh_mapp": 9, "punctuation_normalization_mapp": 9, "remove_bibliography_mapp": 9, "remove_comments_mapp": 9, "remove_header_mapp": 9, "remove_long_words_mapp": 9, "remove_specific_chars_mapp": 9, "remove_table_text_mapp": 9, "remove_words_with_incorrect_substrings_mapp": 9, "sentence_split_mapp": 9, "whitespace_normalization_mapp": 9, "selector": 10, "frequency_specified_field_selector": 10, "topk_specified_field_selector": 10, "util": 12, "asset_util": 12, "cache_util": 12, "ckpt_util": 12, "compress": 12, "constant": 12, "file_util": 12, "fingerprint_util": 12, "logger_util": 12, "model_util": 12, "registri": 12, "welcom": 13, "juicer": 13, "document": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"d a t a _ j u i c e r": [[0, "d-a-t-a-j-u-i-c-e-r"]], "d a t a _ j u i c e r . a n a l y s i s": [[1, "d-a-t-a-j-u-i-c-e-r-a-n-a-l-y-s-i-s"]], "data_juicer.analysis.column_wise_analysis": [[1, "module-data_juicer.analysis.column_wise_analysis"]], "data_juicer.analysis.diversity_analysis": [[1, "module-data_juicer.analysis.diversity_analysis"]], "data_juicer.analysis.overall_analysis": [[1, "module-data_juicer.analysis.overall_analysis"]], "d a t a _ j u i c e r . c o n f i g": [[2, "d-a-t-a-j-u-i-c-e-r-c-o-n-f-i-g"]], "data_juicer.config.config": [[2, "module-data_juicer.config.config"]], "d a t a _ j u i c e r . c o r e": [[3, "d-a-t-a-j-u-i-c-e-r-c-o-r-e"]], "data_juicer.core.analyser": [[3, "module-data_juicer.core.analyser"]], "data_juicer.core.data": [[3, "module-data_juicer.core.data"]], "data_juicer.core.executor": [[3, "module-data_juicer.core.executor"]], "data_juicer.core.exporter": [[3, "module-data_juicer.core.exporter"]], "data_juicer.core.ray_executor": [[3, "module-data_juicer.core.ray_executor"]], "data_juicer.core.tracer": [[3, "module-data_juicer.core.tracer"]], "d a t a _ j u i c e r . f o r m a t": [[4, "d-a-t-a-j-u-i-c-e-r-f-o-r-m-a-t"]], "data_juicer.format.csv_formatter": [[4, "module-data_juicer.format.csv_formatter"]], "data_juicer.format.formatter": [[4, "module-data_juicer.format.formatter"]], "data_juicer.format.json_formatter": [[4, "module-data_juicer.format.json_formatter"]], "data_juicer.format.load": [[4, "module-data_juicer.format.load"]], "data_juicer.format.mixture_formatter": [[4, "module-data_juicer.format.mixture_formatter"]], "data_juicer.format.parquet_formatter": [[4, "module-data_juicer.format.parquet_formatter"]], "data_juicer.format.text_formatter": [[4, "module-data_juicer.format.text_formatter"]], "data_juicer.format.tsv_formatter": [[4, "module-data_juicer.format.tsv_formatter"]], "d a t a _ j u i c e r . 
o p s": [[5, "d-a-t-a-j-u-i-c-e-r-o-p-s"]], "data_juicer.ops.base_op": [[5, "module-data_juicer.ops.base_op"]], "data_juicer.ops.load": [[5, "module-data_juicer.ops.load"]], "data_juicer.ops.op_fusion": [[5, "module-data_juicer.ops.op_fusion"]], "d a t a _ j u i c e r . o p s . c o m m o n": [[6, "d-a-t-a-j-u-i-c-e-r-o-p-s-c-o-m-m-o-n"]], "data_juicer.ops.common.helper_func": [[6, "module-data_juicer.ops.common.helper_func"]], "data_juicer.ops.common.special_characters": [[6, "module-data_juicer.ops.common.special_characters"]], "d a t a _ j u i c e r . o p s . d e d u p l i c a t o r": [[7, "d-a-t-a-j-u-i-c-e-r-o-p-s-d-e-d-u-p-l-i-c-a-t-o-r"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_deduplicator"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"]], "d a t a _ j u i c e r . o p s . f i l t e r": [[8, "d-a-t-a-j-u-i-c-e-r-o-p-s-f-i-l-t-e-r"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, "module-data_juicer.ops.filter.alphanumeric_filter"]], "data_juicer.ops.filter.average_line_length_filter": [[8, "module-data_juicer.ops.filter.average_line_length_filter"]], "data_juicer.ops.filter.character_repetition_filter": [[8, "module-data_juicer.ops.filter.character_repetition_filter"]], "data_juicer.ops.filter.flagged_words_filter": [[8, "module-data_juicer.ops.filter.flagged_words_filter"]], "data_juicer.ops.filter.language_id_score_filter": [[8, "module-data_juicer.ops.filter.language_id_score_filter"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, "module-data_juicer.ops.filter.maximum_line_length_filter"]], "data_juicer.ops.filter.perplexity_filter": [[8, "module-data_juicer.ops.filter.perplexity_filter"]], "data_juicer.ops.filter.special_characters_filter": [[8, "module-data_juicer.ops.filter.special_characters_filter"]], "data_juicer.ops.filter.specified_field_filter": [[8, "module-data_juicer.ops.filter.specified_field_filter"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, "module-data_juicer.ops.filter.specified_numeric_field_filter"]], "data_juicer.ops.filter.stopwords_filter": [[8, "module-data_juicer.ops.filter.stopwords_filter"]], "data_juicer.ops.filter.suffix_filter": [[8, "module-data_juicer.ops.filter.suffix_filter"]], "data_juicer.ops.filter.text_length_filter": [[8, "module-data_juicer.ops.filter.text_length_filter"]], "data_juicer.ops.filter.token_num_filter": [[8, "module-data_juicer.ops.filter.token_num_filter"]], "data_juicer.ops.filter.word_num_filter": [[8, "module-data_juicer.ops.filter.word_num_filter"]], "data_juicer.ops.filter.word_repetition_filter": [[8, "module-data_juicer.ops.filter.word_repetition_filter"]], "d a t a _ j u i c e r . o p s . 
m a p p e r": [[9, "d-a-t-a-j-u-i-c-e-r-o-p-s-m-a-p-p-e-r"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, "module-data_juicer.ops.mapper.clean_copyright_mapper"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, "module-data_juicer.ops.mapper.clean_email_mapper"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, "module-data_juicer.ops.mapper.clean_html_mapper"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, "module-data_juicer.ops.mapper.clean_ip_mapper"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, "module-data_juicer.ops.mapper.clean_links_mapper"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, "module-data_juicer.ops.mapper.expand_macro_mapper"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, "module-data_juicer.ops.mapper.fix_unicode_mapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, "module-data_juicer.ops.mapper.remove_comments_mapper"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, "module-data_juicer.ops.mapper.remove_header_mapper"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, "module-data_juicer.ops.mapper.remove_long_words_mapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, "module-data_juicer.ops.mapper.remove_table_text_mapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, "module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, "module-data_juicer.ops.mapper.sentence_split_mapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"]], "d a t a _ j u i c e r . o p s . s e l e c t o r": [[10, "d-a-t-a-j-u-i-c-e-r-o-p-s-s-e-l-e-c-t-o-r"]], "data_juicer.ops.selector.frequency_specified_field_selector": [[10, "module-data_juicer.ops.selector.frequency_specified_field_selector"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, "module-data_juicer.ops.selector.topk_specified_field_selector"]], "d a t a _ j u i c e r . t o o l s": [[11, "d-a-t-a-j-u-i-c-e-r-t-o-o-l-s"]], "d a t a _ j u i c e r . 
u t i l s": [[12, "d-a-t-a-j-u-i-c-e-r-u-t-i-l-s"]], "data_juicer.utils.asset_utils": [[12, "module-data_juicer.utils.asset_utils"]], "data_juicer.utils.cache_utils": [[12, "module-data_juicer.utils.cache_utils"]], "data_juicer.utils.ckpt_utils": [[12, "module-data_juicer.utils.ckpt_utils"]], "data_juicer.utils.compress": [[12, "module-data_juicer.utils.compress"]], "data_juicer.utils.constant": [[12, "module-data_juicer.utils.constant"]], "data_juicer.utils.file_utils": [[12, "module-data_juicer.utils.file_utils"]], "data_juicer.utils.fingerprint_utils": [[12, "module-data_juicer.utils.fingerprint_utils"]], "data_juicer.utils.logger_utils": [[12, "module-data_juicer.utils.logger_utils"]], "data_juicer.utils.model_utils": [[12, "module-data_juicer.utils.model_utils"]], "data_juicer.utils.registry": [[12, "module-data_juicer.utils.registry"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "data_juicer": [[13, "data-juicer"], [14, "data-juicer"]], "Indices and tables": [[13, "indices-and-tables"]]}, "indexentries": {"columnwiseanalysis (class in data_juicer.analysis.column_wise_analysis)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis.overall_analysis)": [[1, "data_juicer.analysis.overall_analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversity_analysis.diversityanalysis method)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overall_analysis.overallanalysis method)": [[1, "data_juicer.analysis.overall_analysis.OverallAnalysis.__init__"]], "analyse() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.analyse"]], "analyse() (data_juicer.analysis.diversity_analysis.diversityanalysis method)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis.analyse"]], "analyse() (data_juicer.analysis.overall_analysis.overallanalysis method)": [[1, "data_juicer.analysis.overall_analysis.OverallAnalysis.analyse"]], "compute() (data_juicer.analysis.diversity_analysis.diversityanalysis method)": [[1, "data_juicer.analysis.diversity_analysis.DiversityAnalysis.compute"]], "data_juicer.analysis.column_wise_analysis": [[1, "module-data_juicer.analysis.column_wise_analysis"]], "data_juicer.analysis.diversity_analysis": [[1, "module-data_juicer.analysis.diversity_analysis"]], "data_juicer.analysis.overall_analysis": [[1, "module-data_juicer.analysis.overall_analysis"]], "draw_box() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.column_wise_analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis.draw_hist"]], "find_root_verb_and_its_dobj() (in module data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj"]], "find_root_verb_and_its_dobj_in_string() (in module data_juicer.analysis.diversity_analysis)": [[1, 
"data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj_in_string"]], "get_diversity() (in module data_juicer.analysis.diversity_analysis)": [[1, "data_juicer.analysis.diversity_analysis.get_diversity"]], "get_row_col() (in module data_juicer.analysis.column_wise_analysis)": [[1, "data_juicer.analysis.column_wise_analysis.get_row_col"]], "module": [[1, "module-data_juicer.analysis.column_wise_analysis"], [1, "module-data_juicer.analysis.diversity_analysis"], [1, "module-data_juicer.analysis.overall_analysis"], [2, "module-data_juicer.config.config"], [3, "module-data_juicer.core.analyser"], [3, "module-data_juicer.core.data"], [3, "module-data_juicer.core.executor"], [3, "module-data_juicer.core.exporter"], [3, "module-data_juicer.core.ray_executor"], [3, "module-data_juicer.core.tracer"], [4, "module-data_juicer.format.csv_formatter"], [4, "module-data_juicer.format.formatter"], [4, "module-data_juicer.format.json_formatter"], [4, "module-data_juicer.format.load"], [4, "module-data_juicer.format.mixture_formatter"], [4, "module-data_juicer.format.parquet_formatter"], [4, "module-data_juicer.format.text_formatter"], [4, "module-data_juicer.format.tsv_formatter"], [5, "module-data_juicer.ops.base_op"], [5, "module-data_juicer.ops.load"], [5, "module-data_juicer.ops.op_fusion"], [6, "module-data_juicer.ops.common.helper_func"], [6, "module-data_juicer.ops.common.special_characters"], [7, "module-data_juicer.ops.deduplicator.document_deduplicator"], [7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"], [7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"], [8, "module-data_juicer.ops.filter.alphanumeric_filter"], [8, "module-data_juicer.ops.filter.average_line_length_filter"], [8, "module-data_juicer.ops.filter.character_repetition_filter"], [8, "module-data_juicer.ops.filter.flagged_words_filter"], [8, "module-data_juicer.ops.filter.language_id_score_filter"], [8, "module-data_juicer.ops.filter.maximum_line_length_filter"], [8, "module-data_juicer.ops.filter.perplexity_filter"], [8, "module-data_juicer.ops.filter.special_characters_filter"], [8, "module-data_juicer.ops.filter.specified_field_filter"], [8, "module-data_juicer.ops.filter.specified_numeric_field_filter"], [8, "module-data_juicer.ops.filter.stopwords_filter"], [8, "module-data_juicer.ops.filter.suffix_filter"], [8, "module-data_juicer.ops.filter.text_length_filter"], [8, "module-data_juicer.ops.filter.token_num_filter"], [8, "module-data_juicer.ops.filter.word_num_filter"], [8, "module-data_juicer.ops.filter.word_repetition_filter"], [9, "module-data_juicer.ops.mapper.clean_copyright_mapper"], [9, "module-data_juicer.ops.mapper.clean_email_mapper"], [9, "module-data_juicer.ops.mapper.clean_html_mapper"], [9, "module-data_juicer.ops.mapper.clean_ip_mapper"], [9, "module-data_juicer.ops.mapper.clean_links_mapper"], [9, "module-data_juicer.ops.mapper.expand_macro_mapper"], [9, "module-data_juicer.ops.mapper.fix_unicode_mapper"], [9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"], [9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"], [9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"], [9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"], [9, "module-data_juicer.ops.mapper.remove_comments_mapper"], [9, "module-data_juicer.ops.mapper.remove_header_mapper"], [9, "module-data_juicer.ops.mapper.remove_long_words_mapper"], [9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"], [9, "module-data_juicer.ops.mapper.remove_table_text_mapper"], [9, 
"module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"], [9, "module-data_juicer.ops.mapper.sentence_split_mapper"], [9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"], [10, "module-data_juicer.ops.selector.frequency_specified_field_selector"], [10, "module-data_juicer.ops.selector.topk_specified_field_selector"], [12, "module-data_juicer.utils.asset_utils"], [12, "module-data_juicer.utils.cache_utils"], [12, "module-data_juicer.utils.ckpt_utils"], [12, "module-data_juicer.utils.compress"], [12, "module-data_juicer.utils.constant"], [12, "module-data_juicer.utils.file_utils"], [12, "module-data_juicer.utils.fingerprint_utils"], [12, "module-data_juicer.utils.logger_utils"], [12, "module-data_juicer.utils.model_utils"], [12, "module-data_juicer.utils.registry"]], "config_backup() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.config_backup"]], "data_juicer.config.config": [[2, "module-data_juicer.config.config"]], "display_config() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.display_config"]], "init_configs() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.init_configs"]], "init_setup_from_cfg() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.init_setup_from_cfg"]], "sort_op_by_types_and_names() (in module data_juicer.config.config)": [[2, "data_juicer.config.config.sort_op_by_types_and_names"]], "analyser (class in data_juicer.core.analyser)": [[3, "data_juicer.core.analyser.Analyser"]], "executor (class in data_juicer.core.executor)": [[3, "data_juicer.core.executor.Executor"]], "exporter (class in data_juicer.core.exporter)": [[3, "data_juicer.core.exporter.Exporter"]], "gib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.GiB"]], "kib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.KiB"]], "mib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.MiB"]], "nesteddataset (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedDataset"]], "nesteddatasetdict (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedDatasetDict"]], "nestedquerydict (class in data_juicer.core.data)": [[3, "data_juicer.core.data.NestedQueryDict"]], "rayexecutor (class in data_juicer.core.ray_executor)": [[3, "data_juicer.core.ray_executor.RayExecutor"]], "tib (data_juicer.core.exporter.exporter attribute)": [[3, "data_juicer.core.exporter.Exporter.TiB"]], "tracer (class in data_juicer.core.tracer)": [[3, "data_juicer.core.tracer.Tracer"]], "__init__() (data_juicer.core.analyser.analyser method)": [[3, "data_juicer.core.analyser.Analyser.__init__"]], "__init__() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.__init__"]], "__init__() (data_juicer.core.data.nesteddatasetdict method)": [[3, "data_juicer.core.data.NestedDatasetDict.__init__"]], "__init__() (data_juicer.core.data.nestedquerydict method)": [[3, "data_juicer.core.data.NestedQueryDict.__init__"]], "__init__() (data_juicer.core.executor.executor method)": [[3, "data_juicer.core.executor.Executor.__init__"]], "__init__() (data_juicer.core.exporter.exporter method)": [[3, "data_juicer.core.exporter.Exporter.__init__"]], "__init__() (data_juicer.core.ray_executor.rayexecutor method)": [[3, "data_juicer.core.ray_executor.RayExecutor.__init__"]], "__init__() (data_juicer.core.tracer.tracer method)": [[3, 
"data_juicer.core.tracer.Tracer.__init__"]], "add_column() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.cleanup_cache_files"]], "data_juicer.core.analyser": [[3, "module-data_juicer.core.analyser"]], "data_juicer.core.data": [[3, "module-data_juicer.core.data"]], "data_juicer.core.executor": [[3, "module-data_juicer.core.executor"]], "data_juicer.core.exporter": [[3, "module-data_juicer.core.exporter"]], "data_juicer.core.ray_executor": [[3, "module-data_juicer.core.ray_executor"]], "data_juicer.core.tracer": [[3, "module-data_juicer.core.tracer"]], "export() (data_juicer.core.exporter.exporter method)": [[3, "data_juicer.core.exporter.Exporter.export"]], "filter() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.filter"]], "from_dict() (data_juicer.core.data.nesteddataset class method)": [[3, "data_juicer.core.data.NestedDataset.from_dict"]], "map() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.map"]], "map() (data_juicer.core.data.nesteddatasetdict method)": [[3, "data_juicer.core.data.NestedDatasetDict.map"]], "nested_obj_factory() (in module data_juicer.core.data)": [[3, "data_juicer.core.data.nested_obj_factory"]], "nested_query() (in module data_juicer.core.data)": [[3, "data_juicer.core.data.nested_query"]], "remove_columns() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyser.analyser method)": [[3, "data_juicer.core.analyser.Analyser.run"]], "run() (data_juicer.core.executor.executor method)": [[3, "data_juicer.core.executor.Executor.run"]], "run() (data_juicer.core.ray_executor.rayexecutor method)": [[3, "data_juicer.core.ray_executor.RayExecutor.run"]], "select() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.select"]], "select_columns() (data_juicer.core.data.nesteddataset method)": [[3, "data_juicer.core.data.NestedDataset.select_columns"]], "to_jsonl() (data_juicer.core.exporter.exporter static method)": [[3, "data_juicer.core.exporter.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter.exporter static method)": [[3, "data_juicer.core.exporter.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer.tracer method)": [[3, "data_juicer.core.tracer.Tracer.trace_mapper"]], "wrap_func_with_nested_access() (in module data_juicer.core.data)": [[3, "data_juicer.core.data.wrap_func_with_nested_access"]], "baseformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.BaseFormatter"]], "csvformatter (class in data_juicer.format.csv_formatter)": [[4, "data_juicer.format.csv_formatter.CsvFormatter"]], "jsonformatter (class in data_juicer.format.json_formatter)": [[4, "data_juicer.format.json_formatter.JsonFormatter"]], "localformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.LocalFormatter"]], "mixtureformatter (class in data_juicer.format.mixture_formatter)": [[4, 
"data_juicer.format.mixture_formatter.MixtureFormatter"]], "parquetformatter (class in data_juicer.format.parquet_formatter)": [[4, "data_juicer.format.parquet_formatter.ParquetFormatter"]], "remoteformatter (class in data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.RemoteFormatter"]], "suffixes (data_juicer.format.csv_formatter.csvformatter attribute)": [[4, "data_juicer.format.csv_formatter.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.json_formatter.jsonformatter attribute)": [[4, "data_juicer.format.json_formatter.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquet_formatter.parquetformatter attribute)": [[4, "data_juicer.format.parquet_formatter.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.text_formatter.textformatter attribute)": [[4, "data_juicer.format.text_formatter.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsv_formatter.tsvformatter attribute)": [[4, "data_juicer.format.tsv_formatter.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.TextFormatter"]], "tsvformatter (class in data_juicer.format.tsv_formatter)": [[4, "data_juicer.format.tsv_formatter.TsvFormatter"]], "__init__() (data_juicer.format.csv_formatter.csvformatter method)": [[4, "data_juicer.format.csv_formatter.CsvFormatter.__init__"]], "__init__() (data_juicer.format.formatter.localformatter method)": [[4, "data_juicer.format.formatter.LocalFormatter.__init__"]], "__init__() (data_juicer.format.formatter.remoteformatter method)": [[4, "data_juicer.format.formatter.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.json_formatter.jsonformatter method)": [[4, "data_juicer.format.json_formatter.JsonFormatter.__init__"]], "__init__() (data_juicer.format.mixture_formatter.mixtureformatter method)": [[4, "data_juicer.format.mixture_formatter.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquet_formatter.parquetformatter method)": [[4, "data_juicer.format.parquet_formatter.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.text_formatter.textformatter method)": [[4, "data_juicer.format.text_formatter.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsv_formatter.tsvformatter method)": [[4, "data_juicer.format.tsv_formatter.TsvFormatter.__init__"]], "add_suffixes() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.add_suffixes"]], "data_juicer.format.csv_formatter": [[4, "module-data_juicer.format.csv_formatter"]], "data_juicer.format.formatter": [[4, "module-data_juicer.format.formatter"]], "data_juicer.format.json_formatter": [[4, "module-data_juicer.format.json_formatter"]], "data_juicer.format.load": [[4, "module-data_juicer.format.load"]], "data_juicer.format.mixture_formatter": [[4, "module-data_juicer.format.mixture_formatter"]], "data_juicer.format.parquet_formatter": [[4, "module-data_juicer.format.parquet_formatter"]], "data_juicer.format.text_formatter": [[4, "module-data_juicer.format.text_formatter"]], "data_juicer.format.tsv_formatter": [[4, "module-data_juicer.format.tsv_formatter"]], "extract_txt_from_docx() (in module data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.extract_txt_from_docx"]], "extract_txt_from_pdf() (in module data_juicer.format.text_formatter)": [[4, "data_juicer.format.text_formatter.extract_txt_from_pdf"]], "load_dataset() (data_juicer.format.formatter.baseformatter method)": [[4, "data_juicer.format.formatter.BaseFormatter.load_dataset"]], 
"load_dataset() (data_juicer.format.formatter.localformatter method)": [[4, "data_juicer.format.formatter.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.formatter.remoteformatter method)": [[4, "data_juicer.format.formatter.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixture_formatter.mixtureformatter method)": [[4, "data_juicer.format.mixture_formatter.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.text_formatter.textformatter method)": [[4, "data_juicer.format.text_formatter.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.load_formatter"]], "load_formatter() (in module data_juicer.format.load)": [[4, "data_juicer.format.load.load_formatter"]], "unify_format() (in module data_juicer.format.formatter)": [[4, "data_juicer.format.formatter.unify_format"]], "deduplicator (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Deduplicator"]], "filter (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Filter"]], "fusedfilter (class in data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.FusedFilter"]], "mapper (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Mapper"]], "selector (class in data_juicer.ops.base_op)": [[5, "data_juicer.ops.base_op.Selector"]], "__init__() (data_juicer.ops.base_op.deduplicator method)": [[5, "data_juicer.ops.base_op.Deduplicator.__init__"]], "__init__() (data_juicer.ops.base_op.filter method)": [[5, "data_juicer.ops.base_op.Filter.__init__"]], "__init__() (data_juicer.ops.base_op.mapper method)": [[5, "data_juicer.ops.base_op.Mapper.__init__"]], "__init__() (data_juicer.ops.base_op.selector method)": [[5, "data_juicer.ops.base_op.Selector.__init__"]], "__init__() (data_juicer.ops.op_fusion.fusedfilter method)": [[5, "data_juicer.ops.op_fusion.FusedFilter.__init__"]], "compute_hash() (data_juicer.ops.base_op.deduplicator method)": [[5, "data_juicer.ops.base_op.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.base_op.filter method)": [[5, "data_juicer.ops.base_op.Filter.compute_stats"]], "compute_stats() (data_juicer.ops.op_fusion.fusedfilter method)": [[5, "data_juicer.ops.op_fusion.FusedFilter.compute_stats"]], "data_juicer.ops.base_op": [[5, "module-data_juicer.ops.base_op"]], "data_juicer.ops.load": [[5, "module-data_juicer.ops.load"]], "data_juicer.ops.op_fusion": [[5, "module-data_juicer.ops.op_fusion"]], "fuse_filter_group() (in module data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.fuse_filter_group"]], "fuse_operators() (in module data_juicer.ops.op_fusion)": [[5, "data_juicer.ops.op_fusion.fuse_operators"]], "is_batched_op() (data_juicer.ops.base_op.mapper method)": [[5, "data_juicer.ops.base_op.Mapper.is_batched_op"]], "load_ops() (in module data_juicer.ops.load)": [[5, "data_juicer.ops.load.load_ops"]], "process() (data_juicer.ops.base_op.deduplicator method)": [[5, "data_juicer.ops.base_op.Deduplicator.process"]], "process() (data_juicer.ops.base_op.filter method)": [[5, "data_juicer.ops.base_op.Filter.process"]], "process() (data_juicer.ops.base_op.mapper method)": [[5, "data_juicer.ops.base_op.Mapper.process"]], "process() (data_juicer.ops.base_op.selector method)": [[5, "data_juicer.ops.base_op.Selector.process"]], "process() (data_juicer.ops.op_fusion.fusedfilter method)": [[5, "data_juicer.ops.op_fusion.FusedFilter.process"]], "unionfind (class in data_juicer.ops.common.helper_func)": [[6, 
"data_juicer.ops.common.helper_func.UnionFind"]], "__init__() (data_juicer.ops.common.helper_func.unionfind method)": [[6, "data_juicer.ops.common.helper_func.UnionFind.__init__"]], "data_juicer.ops.common.helper_func": [[6, "module-data_juicer.ops.common.helper_func"]], "data_juicer.ops.common.special_characters": [[6, "module-data_juicer.ops.common.special_characters"]], "find() (data_juicer.ops.common.helper_func.unionfind method)": [[6, "data_juicer.ops.common.helper_func.UnionFind.find"]], "get_sentences_from_document() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.split_on_whitespace"]], "strip() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.strip"]], "union() (data_juicer.ops.common.helper_func.unionfind method)": [[6, "data_juicer.ops.common.helper_func.UnionFind.union"]], "words_augmentation() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common.helper_func)": [[6, "data_juicer.ops.common.helper_func.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator.document_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.document_deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.document_minhash_deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.document_simhash_deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator.__init__"]], "compute_hash() (data_juicer.ops.deduplicator.document_deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.document_minhash_deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.document_simhash_deduplicator.documentsimhashdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "data_juicer.ops.deduplicator.document_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_deduplicator"]], "data_juicer.ops.deduplicator.document_minhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_minhash_deduplicator"]], "data_juicer.ops.deduplicator.document_simhash_deduplicator": [[7, "module-data_juicer.ops.deduplicator.document_simhash_deduplicator"]], "local_num_differing_bits() (in module data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.local_num_differing_bits"]], "num_differing_bits_selector() (in module data_juicer.ops.deduplicator.document_simhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.num_differing_bits_selector"]], "optimal_param() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.optimal_param"]], "process() (data_juicer.ops.deduplicator.document_deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.document_minhash_deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.document_simhash_deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator.process"]], "sha1_hash32() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)": [[7, "data_juicer.ops.deduplicator.document_minhash_deduplicator.sha1_hash32"]], "alphanumericfilter (class in data_juicer.ops.filter.alphanumeric_filter)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter.average_line_length_filter)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter.character_repetition_filter)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter.flagged_words_filter)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter"]], "languageidscorefilter (class in data_juicer.ops.filter.language_id_score_filter)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter.maximum_line_length_filter)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter.perplexity_filter)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter.special_characters_filter)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter.specified_field_filter)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter.specified_numeric_field_filter)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in 
data_juicer.ops.filter.stopwords_filter)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter.suffix_filter)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter"]], "textlengthfilter (class in data_juicer.ops.filter.text_length_filter)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter.token_num_filter)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter"]], "wordnumfilter (class in data_juicer.ops.filter.word_num_filter)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter.word_repetition_filter)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter"]], "__init__() (data_juicer.ops.filter.alphanumeric_filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.average_line_length_filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.character_repetition_filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flagged_words_filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.language_id_score_filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximum_line_length_filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexity_filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.special_characters_filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specified_field_filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specified_numeric_field_filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwords_filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffix_filter.suffixfilter method)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.text_length_filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.token_num_filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.word_num_filter.wordnumfilter method)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.word_repetition_filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter.__init__"]], 
"compute_stats() (data_juicer.ops.filter.alphanumeric_filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.average_line_length_filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.character_repetition_filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flagged_words_filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.language_id_score_filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximum_line_length_filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexity_filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.special_characters_filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specified_field_filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specified_numeric_field_filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwords_filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffix_filter.suffixfilter method)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.text_length_filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.token_num_filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.word_num_filter.wordnumfilter method)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.word_repetition_filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter.compute_stats"]], "data_juicer.ops.filter.alphanumeric_filter": [[8, "module-data_juicer.ops.filter.alphanumeric_filter"]], "data_juicer.ops.filter.average_line_length_filter": [[8, "module-data_juicer.ops.filter.average_line_length_filter"]], "data_juicer.ops.filter.character_repetition_filter": [[8, "module-data_juicer.ops.filter.character_repetition_filter"]], "data_juicer.ops.filter.flagged_words_filter": [[8, "module-data_juicer.ops.filter.flagged_words_filter"]], "data_juicer.ops.filter.language_id_score_filter": [[8, "module-data_juicer.ops.filter.language_id_score_filter"]], "data_juicer.ops.filter.maximum_line_length_filter": [[8, 
"module-data_juicer.ops.filter.maximum_line_length_filter"]], "data_juicer.ops.filter.perplexity_filter": [[8, "module-data_juicer.ops.filter.perplexity_filter"]], "data_juicer.ops.filter.special_characters_filter": [[8, "module-data_juicer.ops.filter.special_characters_filter"]], "data_juicer.ops.filter.specified_field_filter": [[8, "module-data_juicer.ops.filter.specified_field_filter"]], "data_juicer.ops.filter.specified_numeric_field_filter": [[8, "module-data_juicer.ops.filter.specified_numeric_field_filter"]], "data_juicer.ops.filter.stopwords_filter": [[8, "module-data_juicer.ops.filter.stopwords_filter"]], "data_juicer.ops.filter.suffix_filter": [[8, "module-data_juicer.ops.filter.suffix_filter"]], "data_juicer.ops.filter.text_length_filter": [[8, "module-data_juicer.ops.filter.text_length_filter"]], "data_juicer.ops.filter.token_num_filter": [[8, "module-data_juicer.ops.filter.token_num_filter"]], "data_juicer.ops.filter.word_num_filter": [[8, "module-data_juicer.ops.filter.word_num_filter"]], "data_juicer.ops.filter.word_repetition_filter": [[8, "module-data_juicer.ops.filter.word_repetition_filter"]], "is_number() (in module data_juicer.ops.filter.specified_numeric_field_filter)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.is_number"]], "process() (data_juicer.ops.filter.alphanumeric_filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.average_line_length_filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.character_repetition_filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flagged_words_filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.language_id_score_filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximum_line_length_filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexity_filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.perplexity_filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.special_characters_filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specified_field_filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specified_numeric_field_filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwords_filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.stopwords_filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffix_filter.suffixfilter method)": [[8, "data_juicer.ops.filter.suffix_filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.text_length_filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.text_length_filter.TextLengthFilter.process"]], "process() 
(data_juicer.ops.filter.token_num_filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.token_num_filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.word_num_filter.wordnumfilter method)": [[8, "data_juicer.ops.filter.word_num_filter.WordNumFilter.process"]], "process() (data_juicer.ops.filter.word_repetition_filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter.process"]], "cleancopyrightmapper (class in data_juicer.ops.mapper.clean_copyright_mapper)": [[9, "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper.clean_email_mapper)": [[9, "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper.clean_html_mapper)": [[9, "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper.clean_ip_mapper)": [[9, "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper.clean_links_mapper)": [[9, "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper.expand_macro_mapper)": [[9, "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper.fix_unicode_mapper)": [[9, "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper.nlpaug_en_mapper)": [[9, "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper.nlpcda_zh_mapper)": [[9, "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper.punctuation_normalization_mapper)": [[9, "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper.remove_bibliography_mapper)": [[9, "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper.remove_comments_mapper)": [[9, "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper.remove_header_mapper)": [[9, "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper.remove_long_words_mapper)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper.remove_specific_chars_mapper)": [[9, "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper.remove_table_text_mapper)": [[9, "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "sentencesplitmapper (class in data_juicer.ops.mapper.sentence_split_mapper)": [[9, "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper.whitespace_normalization_mapper)": [[9, "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper"]], "__init__() 
(data_juicer.ops.mapper.clean_copyright_mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.clean_email_mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.clean_html_mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.clean_ip_mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.clean_links_mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expand_macro_mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fix_unicode_mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaug_en_mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcda_zh_mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuation_normalization_mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_bibliography_mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_comments_mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_header_mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_long_words_mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_specific_chars_mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_table_text_mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentence_split_mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespace_normalization_mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper.__init__"]], "data_juicer.ops.mapper.clean_copyright_mapper": [[9, "module-data_juicer.ops.mapper.clean_copyright_mapper"]], "data_juicer.ops.mapper.clean_email_mapper": [[9, 
"module-data_juicer.ops.mapper.clean_email_mapper"]], "data_juicer.ops.mapper.clean_html_mapper": [[9, "module-data_juicer.ops.mapper.clean_html_mapper"]], "data_juicer.ops.mapper.clean_ip_mapper": [[9, "module-data_juicer.ops.mapper.clean_ip_mapper"]], "data_juicer.ops.mapper.clean_links_mapper": [[9, "module-data_juicer.ops.mapper.clean_links_mapper"]], "data_juicer.ops.mapper.expand_macro_mapper": [[9, "module-data_juicer.ops.mapper.expand_macro_mapper"]], "data_juicer.ops.mapper.fix_unicode_mapper": [[9, "module-data_juicer.ops.mapper.fix_unicode_mapper"]], "data_juicer.ops.mapper.nlpaug_en_mapper": [[9, "module-data_juicer.ops.mapper.nlpaug_en_mapper"]], "data_juicer.ops.mapper.nlpcda_zh_mapper": [[9, "module-data_juicer.ops.mapper.nlpcda_zh_mapper"]], "data_juicer.ops.mapper.punctuation_normalization_mapper": [[9, "module-data_juicer.ops.mapper.punctuation_normalization_mapper"]], "data_juicer.ops.mapper.remove_bibliography_mapper": [[9, "module-data_juicer.ops.mapper.remove_bibliography_mapper"]], "data_juicer.ops.mapper.remove_comments_mapper": [[9, "module-data_juicer.ops.mapper.remove_comments_mapper"]], "data_juicer.ops.mapper.remove_header_mapper": [[9, "module-data_juicer.ops.mapper.remove_header_mapper"]], "data_juicer.ops.mapper.remove_long_words_mapper": [[9, "module-data_juicer.ops.mapper.remove_long_words_mapper"]], "data_juicer.ops.mapper.remove_specific_chars_mapper": [[9, "module-data_juicer.ops.mapper.remove_specific_chars_mapper"]], "data_juicer.ops.mapper.remove_table_text_mapper": [[9, "module-data_juicer.ops.mapper.remove_table_text_mapper"]], "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper": [[9, "module-data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper"]], "data_juicer.ops.mapper.sentence_split_mapper": [[9, "module-data_juicer.ops.mapper.sentence_split_mapper"]], "data_juicer.ops.mapper.whitespace_normalization_mapper": [[9, "module-data_juicer.ops.mapper.whitespace_normalization_mapper"]], "process() (data_juicer.ops.mapper.clean_copyright_mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.clean_email_mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.clean_html_mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.clean_ip_mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.clean_links_mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expand_macro_mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.fix_unicode_mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.nlpaug_en_mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcda_zh_mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuation_normalization_mapper.punctuationnormalizationmapper method)": [[9, 
"data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.remove_bibliography_mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.remove_comments_mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.remove_header_mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.remove_long_words_mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.remove_specific_chars_mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.remove_table_text_mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.sentence_split_mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.whitespace_normalization_mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.remove_long_words_mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector.frequency_specified_field_selector)": [[10, "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector.topk_specified_field_selector)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequency_specified_field_selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topk_specified_field_selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector.frequency_specified_field_selector": [[10, "module-data_juicer.ops.selector.frequency_specified_field_selector"]], "data_juicer.ops.selector.topk_specified_field_selector": [[10, "module-data_juicer.ops.selector.topk_specified_field_selector"]], "process() 
(data_juicer.ops.selector.frequency_specified_field_selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topk_specified_field_selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector.process"]], "to_number() (in module data_juicer.ops.selector.topk_specified_field_selector)": [[10, "data_juicer.ops.selector.topk_specified_field_selector.to_number"]], "basecompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.BaseCompressor"]], "cachecompressmanager (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CacheCompressManager"]], "checkpointmanager (class in data_juicer.utils.ckpt_utils)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager"]], "compressmanager (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CompressManager"]], "compressionoff (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.CompressionOff"]], "compressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Compressor"]], "extractor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Extractor"]], "fields (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.Fields"]], "filelock (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.FileLock"]], "gzipcompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.GzipCompressor"]], "hashkeys (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.HashKeys"]], "hasher (class in data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.Hasher"]], "hiddenprints (class in data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.HiddenPrints"]], "intervars (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.InterVars"]], "lz4compressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.Lz4Compressor"]], "registry (class in data_juicer.utils.registry)": [[12, "data_juicer.utils.registry.Registry"]], "statskeys (class in data_juicer.utils.constant)": [[12, "data_juicer.utils.constant.StatsKeys"]], "streamtologuru (class in data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru"]], "zstdcompressor (class in data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.ZstdCompressor"]], "__init__() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.__init__"]], "__init__() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.__init__"]], "__init__() (data_juicer.utils.compress.compressmanager method)": [[12, "data_juicer.utils.compress.CompressManager.__init__"]], "__init__() (data_juicer.utils.fingerprint_utils.hasher method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.__init__"]], "__init__() (data_juicer.utils.logger_utils.streamtologuru method)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru.__init__"]], "__init__() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.__init__"]], "alnum_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.alnum_ratio"]], "alpha_token_ratio 
(data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.alpha_token_ratio"]], "avg_line_length (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.avg_line_length"]], "char_rep_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.char_rep_ratio"]], "check_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.check_ckpt"]], "check_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.check_model"]], "check_ops_to_skip() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.check_ops_to_skip"]], "cleanup_cache_files() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.cleanup_cache_files"]], "cleanup_compressed_cache_files() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.cleanup_compressed_cache_files"]], "compress() (data_juicer.utils.compress.basecompressor static method)": [[12, "data_juicer.utils.compress.BaseCompressor.compress"]], "compress() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.compress"]], "compress() (data_juicer.utils.compress.compressmanager method)": [[12, "data_juicer.utils.compress.CompressManager.compress"]], "compress() (data_juicer.utils.compress.compressor class method)": [[12, "data_juicer.utils.compress.Compressor.compress"]], "compress() (data_juicer.utils.compress.gzipcompressor static method)": [[12, "data_juicer.utils.compress.GzipCompressor.compress"]], "compress() (data_juicer.utils.compress.lz4compressor static method)": [[12, "data_juicer.utils.compress.Lz4Compressor.compress"]], "compress() (data_juicer.utils.compress.zstdcompressor static method)": [[12, "data_juicer.utils.compress.ZstdCompressor.compress"]], "compress() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.compress"]], "compressors (data_juicer.utils.compress.compressor attribute)": [[12, "data_juicer.utils.compress.Compressor.compressors"]], "context (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.context"]], "data_juicer.utils.asset_utils": [[12, "module-data_juicer.utils.asset_utils"]], "data_juicer.utils.cache_utils": [[12, "module-data_juicer.utils.cache_utils"]], "data_juicer.utils.ckpt_utils": [[12, "module-data_juicer.utils.ckpt_utils"]], "data_juicer.utils.compress": [[12, "module-data_juicer.utils.compress"]], "data_juicer.utils.constant": [[12, "module-data_juicer.utils.constant"]], "data_juicer.utils.file_utils": [[12, "module-data_juicer.utils.file_utils"]], "data_juicer.utils.fingerprint_utils": [[12, "module-data_juicer.utils.fingerprint_utils"]], "data_juicer.utils.logger_utils": [[12, "module-data_juicer.utils.logger_utils"]], "data_juicer.utils.model_utils": [[12, "module-data_juicer.utils.model_utils"]], "data_juicer.utils.registry": [[12, "module-data_juicer.utils.registry"]], "decompress() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.decompress"]], "decompress() (data_juicer.utils.compress.compressmanager method)": [[12, "data_juicer.utils.compress.CompressManager.decompress"]], "decompress() (in module data_juicer.utils.compress)": [[12, "data_juicer.utils.compress.decompress"]], "dispatch 
(data_juicer.utils.fingerprint_utils.hasher attribute)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.dispatch"]], "extract() (data_juicer.utils.compress.extractor class method)": [[12, "data_juicer.utils.compress.Extractor.extract"]], "find_files_with_suffix() (in module data_juicer.utils.file_utils)": [[12, "data_juicer.utils.file_utils.find_files_with_suffix"]], "flagged_words_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.flagged_words_ratio"]], "flush() (data_juicer.utils.logger_utils.streamtologuru method)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru.flush"]], "format_cache_file_name() (data_juicer.utils.compress.cachecompressmanager method)": [[12, "data_juicer.utils.compress.CacheCompressManager.format_cache_file_name"]], "generate_fingerprint() (in module data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.generate_fingerprint"]], "get() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.get"]], "get_caller_name() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.get_caller_name"]], "get_left_process_list() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.get_left_process_list"]], "get_log_file_path() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.get_log_file_path"]], "get_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.get_model"]], "hash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.hash"]], "hash() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash"]], "hash_bytes() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash_bytes"]], "hash_default() (data_juicer.utils.fingerprint_utils.hasher class method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hash_default"]], "hexdigest() (data_juicer.utils.fingerprint_utils.hasher method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.hexdigest"]], "is_absolute_path() (in module data_juicer.utils.file_utils)": [[12, "data_juicer.utils.file_utils.is_absolute_path"]], "lang (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.lang"]], "lang_score (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.lang_score"]], "lines (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.lines"]], "list() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.list"]], "load_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.load_ckpt"]], "load_words_asset() (in module data_juicer.utils.asset_utils)": [[12, "data_juicer.utils.asset_utils.load_words_asset"]], "max_line_length (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.max_line_length"]], "meta (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.meta"]], "minhash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.minhash"]], "modules (data_juicer.utils.registry.registry property)": [[12, "data_juicer.utils.registry.Registry.modules"]], "name 
(data_juicer.utils.registry.registry property)": [[12, "data_juicer.utils.registry.Registry.name"]], "num_token (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.num_token"]], "num_words (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.num_words"]], "perplexity (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.perplexity"]], "prepare_diversity_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_diversity_model"]], "prepare_fasttext_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_fasttext_model"]], "prepare_huggingface_tokenizer() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_huggingface_tokenizer"]], "prepare_kenlm_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_kenlm_model"]], "prepare_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_model"]], "prepare_nltk_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_nltk_model"]], "prepare_sentencepiece_model() (in module data_juicer.utils.model_utils)": [[12, "data_juicer.utils.model_utils.prepare_sentencepiece_model"]], "record() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.record"]], "redirect_sys_output() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.redirect_sys_output"]], "refined_words (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.refined_words"]], "register_module() (data_juicer.utils.registry.registry method)": [[12, "data_juicer.utils.registry.Registry.register_module"]], "save_ckpt() (data_juicer.utils.ckpt_utils.checkpointmanager method)": [[12, "data_juicer.utils.ckpt_utils.CheckpointManager.save_ckpt"]], "setup_logger() (in module data_juicer.utils.logger_utils)": [[12, "data_juicer.utils.logger_utils.setup_logger"]], "simhash (data_juicer.utils.constant.hashkeys attribute)": [[12, "data_juicer.utils.constant.HashKeys.simhash"]], "special_char_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.special_char_ratio"]], "stats (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.stats"]], "stopwords_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.stopwords_ratio"]], "suffix (data_juicer.utils.constant.fields attribute)": [[12, "data_juicer.utils.constant.Fields.suffix"]], "text_len (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.text_len"]], "update() (data_juicer.utils.fingerprint_utils.hasher method)": [[12, "data_juicer.utils.fingerprint_utils.Hasher.update"]], "update_fingerprint() (in module data_juicer.utils.fingerprint_utils)": [[12, "data_juicer.utils.fingerprint_utils.update_fingerprint"]], "word_rep_ratio (data_juicer.utils.constant.statskeys attribute)": [[12, "data_juicer.utils.constant.StatsKeys.word_rep_ratio"]], "words (data_juicer.utils.constant.intervars attribute)": [[12, "data_juicer.utils.constant.InterVars.words"]], "write() (data_juicer.utils.logger_utils.streamtologuru method)": [[12, "data_juicer.utils.logger_utils.StreamToLoguru.write"]]}}) \ No newline at end of file