diff --git a/.doctrees/data_juicer.analysis.doctree b/.doctrees/data_juicer.analysis.doctree deleted file mode 100644 index 83688c6f2..000000000 Binary files a/.doctrees/data_juicer.analysis.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.config.doctree b/.doctrees/data_juicer.config.doctree deleted file mode 100644 index 8ec06e4e6..000000000 Binary files a/.doctrees/data_juicer.config.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.core.doctree b/.doctrees/data_juicer.core.doctree deleted file mode 100644 index 5897f2495..000000000 Binary files a/.doctrees/data_juicer.core.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.doctree b/.doctrees/data_juicer.doctree deleted file mode 100644 index 068cc9084..000000000 Binary files a/.doctrees/data_juicer.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.format.doctree b/.doctrees/data_juicer.format.doctree deleted file mode 100644 index 2fcaef22d..000000000 Binary files a/.doctrees/data_juicer.format.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.common.doctree b/.doctrees/data_juicer.ops.common.doctree deleted file mode 100644 index afd5f0b87..000000000 Binary files a/.doctrees/data_juicer.ops.common.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.deduplicator.doctree b/.doctrees/data_juicer.ops.deduplicator.doctree deleted file mode 100644 index 06e634384..000000000 Binary files a/.doctrees/data_juicer.ops.deduplicator.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.doctree b/.doctrees/data_juicer.ops.doctree deleted file mode 100644 index 1895ebcb0..000000000 Binary files a/.doctrees/data_juicer.ops.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.filter.doctree b/.doctrees/data_juicer.ops.filter.doctree deleted file mode 100644 index 73d9f8361..000000000 Binary files a/.doctrees/data_juicer.ops.filter.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.mapper.doctree b/.doctrees/data_juicer.ops.mapper.doctree deleted file mode 100644 index 4a6395cb8..000000000 Binary files a/.doctrees/data_juicer.ops.mapper.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.ops.selector.doctree b/.doctrees/data_juicer.ops.selector.doctree deleted file mode 100644 index 9e4cd16b3..000000000 Binary files a/.doctrees/data_juicer.ops.selector.doctree and /dev/null differ diff --git a/.doctrees/data_juicer.utils.doctree b/.doctrees/data_juicer.utils.doctree deleted file mode 100644 index 6482d5580..000000000 Binary files a/.doctrees/data_juicer.utils.doctree and /dev/null differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle index eaf99719e..1db72595d 100644 Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree index bb7231658..1ec9634a1 100644 Binary files a/.doctrees/index.doctree and b/.doctrees/index.doctree differ diff --git a/.doctrees/modules.doctree b/.doctrees/modules.doctree index 46c885473..7b5193c75 100644 Binary files a/.doctrees/modules.doctree and b/.doctrees/modules.doctree differ diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html deleted file mode 100644 index 2705eea87..000000000 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ /dev/null @@ -1,389 +0,0 @@ - - - - - - data_juicer.analysis.column_wise_analysis — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
Source code for data_juicer.analysis.column_wise_analysis

-import math
-import os
-
-import matplotlib.pyplot as plt
-import pandas as pd
-
-from data_juicer.utils.constant import Fields
-
-from .overall_analysis import OverallAnalysis
-
-
-
[docs]def get_row_col(total_num, factor=2): - """ - Given the total number of stats figures, get the "best" number of rows and - columns. This function is needed when we need to store all stats figures - into one image. - - :param total_num: Total number of stats figures - :param factor: Number of sub-figure types in each figure. In - default, it's 2, which means there are histogram and box plot - for each stat figure - :return: "best" number of rows and columns, and the grid list - """ - n = total_num * factor # actual number of figures - now_col = factor # search from the minimum number of columns - now_row = total_num - for col in range(factor, n + 1, factor): - row = n * 1.0 / col - if row != int(row): # skip non-integer results - continue - if col > row: - # object: minimum the difference between number of columns and rows - if abs(col - row) > abs(now_col - now_row): - break - else: - now_row = row - now_col = col - break - now_row = row - now_col = col - - # different sub-figures of the same stats should be in the same row - now_col = now_col // factor - - # get grid indexes - grids = [] - for i in range(total_num): - grids.append((i // now_col, i % now_col)) - - return int(now_row), int(now_col), grids
- - -
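For reference, a minimal call sketch of get_row_col (the figure count below is an arbitrary example, not taken from the page):

# Hypothetical example: lay out 5 stats, each with a histogram and a box plot.
n_row, n_col, grids = get_row_col(5, factor=2)
# n_row x n_col is the recommended panel grid; grids holds the (row, col)
# index of each stat's sub-figure within that grid.
print(n_row, n_col, grids)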
[docs]class ColumnWiseAnalysis: - """Apply analysis on each column of stats respectively.""" - - def __init__(self, - dataset, - output_path, - overall_result=None, - save_stats_in_one_file=True): - """ - Initialization method - :param dataset: the dataset to be analysed - :param output_path: path to store the analysis results - :param overall_result: optional precomputed overall stats result - :param save_stats_in_one_file: whether save all analysis figures of all - stats into one image file - """ - self.stats = pd.DataFrame(dataset[Fields.stats]) - self.output_path = output_path - if not os.path.exists(self.output_path): - os.makedirs(self.output_path) - - # if no overall description provided, analyse it from scratch - if overall_result is None: - oa = OverallAnalysis(dataset, output_path) - overall_result = oa.analyse() - self.overall_result = overall_result - - self.save_stats_in_one_file = save_stats_in_one_file - -
[docs] def analyse(self, show_percentiles=False, show=False): - """ - Apply analysis and draw the analysis figure for stats. - - :param show_percentiles: whether to show the percentile line in - each sub-figure. If it's true, there will be several red - lines to indicate the quantiles of the stats distributions - :param show: whether to show in a single window after drawing - :return: - """ - # number of sub-figures for each stat. There are histogram and box plot - # for now, so it's 2. - num_subcol = 2 - - # Default width and height unit for each sub-figure - width_unit = 4 - height_unit = 6 - - columns = self.stats.columns - num = len(columns) - - # get the recommended "best" number of columns and rows - rec_row, rec_col, grid_indexes = get_row_col(num, num_subcol) - - if self.save_stats_in_one_file: - # if save_stats_in_one_file is opened, use recommended "best" - # number of columns and rows to initialize the image panel. - rec_width = rec_col * num_subcol * width_unit - rec_height = rec_row * height_unit - fig = plt.figure(figsize=(rec_width, rec_height), - layout='constrained') - subfigs = fig.subfigures(rec_row, rec_col, wspace=0.01) - for i, column_name in enumerate(columns): - data = self.stats[column_name] - grid = grid_indexes[i] - if self.save_stats_in_one_file: - if rec_col == 1: - grid = grid[0] - elif rec_row == 1: - grid = grid[1] - - if rec_col == 1 and rec_row == 1: - subfig = subfigs - else: - subfig = subfigs[grid] - subfig.set_facecolor('0.85') - - # numeric or string via nan. Apply different plot method for them. - if pd.isna(self.overall_result[column_name].get('top')): - # numeric -- draw histogram and box plot for this stat - percentiles = self.overall_result[column_name] \ - if show_percentiles else None - - # get axes for each subplot - if self.save_stats_in_one_file: - axes = subfig.subplots(1, num_subcol) - else: - axes = [None] * num_subcol - - # draw histogram - self.draw_hist(axes[0], - data, - os.path.join(self.output_path, - f'{column_name}-hist.png'), - percentiles=percentiles) - - # draw box - self.draw_box(axes[1], - data, - os.path.join(self.output_path, - f'{column_name}-box.png'), - percentiles=percentiles) - else: - # object (string) -- only draw histogram for this stat - if self.save_stats_in_one_file: - axes = subfig.subplots(1, 1) - else: - axes = None - - self.draw_hist( - axes, data, - os.path.join(self.output_path, f'{column_name}-hist.png')) - - # add a title to the figure of this stat - if self.save_stats_in_one_file: - subfig.suptitle(f'{data.name}', - fontsize='x-large', - fontweight='bold') - - if self.save_stats_in_one_file: - fig = plt.gcf() - fig.savefig(os.path.join(self.output_path, 'all-stats.png')) - if show: - plt.show() - else: - pass - # TODO: (fixme) the saved png sometime are blank - plt.clf()
- -
[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False): - """ - Draw the histogram for the data. - - :param ax: the axes to draw - :param data: data to draw - :param save_path: the path to save the histogram figure - :param percentiles: the overall analysis result of the data - including percentile information - :param show: whether to show in a single window after drawing - :return: - """ - # recommended number of bins - data_num = len(data) - if data_num >= 100: - rec_bins = int(math.sqrt(len(data))) - else: - rec_bins = None - - # if ax is None, using plot method in pandas - if ax is None: - ax = data.hist(bins=rec_bins, figsize=(20, 16)) - else: - ax.hist(data, bins=rec_bins) - - # set axes - ax.set_xlabel(data.name) - ax.set_ylabel('Count') - - # draw percentile lines if it's not None - if percentiles is not None: - ymin, ymax = ax.get_ylim() - for percentile in percentiles.keys(): - # skip other information - if percentile in {'count', 'unique', 'top', 'freq', 'std'}: - continue - value = percentiles[percentile] - - ax.vlines(x=value, ymin=ymin, ymax=ymax, colors='r') - ax.text(x=value, y=ymax, s=percentile, rotation=30, color='r') - ax.text(x=value, - y=ymax * 0.97, - s=str(round(value, 3)), - rotation=30, - color='r') - - if not self.save_stats_in_one_file: - # save into file - plt.savefig(save_path) - - if show: - plt.show() - else: - # if no showing, we need to clear this axes to avoid - # accumulated overlapped figures in different draw_xxx function - # calling - ax.clear() - else: - # add a little rotation on labels of x axis to avoid overlapping - ax.tick_params(axis='x', rotation=25)
- -
[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False): - """ - Draw the box plot for the data. - - :param ax: the axes to draw - :param data: data to draw - :param save_path: the path to save the box figure - :param percentiles: the overall analysis result of the data - including percentile information - :param show: whether to show in a single window after drawing - :return: - """ - # if ax is None, using plot method in pandas - if ax is None: - ax = data.plot.box(figsize=(20, 16)) - else: - ax.boxplot(data) - - # set axes - ax.set_ylabel(data.name) - - # draw percentile lines if it's not None - if percentiles is not None: - xmin, xmax = ax.get_xlim() - for percentile in percentiles.keys(): - # skip other information - if percentile in {'count', 'unique', 'top', 'freq', 'std'}: - continue - value = percentiles[percentile] - - ax.hlines(y=value, xmin=xmin, xmax=xmax, colors='r') - ax.text(y=value, - x=xmin + (xmax - xmin) * 0.6, - s=f'{percentile}: {round(value, 3)}', - color='r') - - if not self.save_stats_in_one_file: - # save into file - plt.savefig(save_path) - - if show: - plt.show() - else: - # if no showing, we need to clear this axes to avoid - # accumulated overlapped figures in different draw_xxx function - # calling - ax.clear()
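For reference, a minimal usage sketch of ColumnWiseAnalysis; `dataset` is assumed to be a HuggingFace dataset that already carries a Fields.stats column computed by some Filter ops:

analysis = ColumnWiseAnalysis(dataset,
                              output_path='./analysis',
                              save_stats_in_one_file=True)
# Draws a histogram (and a box plot for numeric stats) for each stats column;
# with save_stats_in_one_file=True, all figures also end up in all-stats.png.
analysis.analyse(show_percentiles=True)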
\ No newline at end of file diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html deleted file mode 100644 index f55b7d108..000000000 --- a/_modules/data_juicer/analysis/diversity_analysis.html +++ /dev/null @@ -1,259 +0,0 @@ data_juicer.analysis.diversity_analysis — data_juicer 0.1.2 documentation
Source code for data_juicer.analysis.diversity_analysis

-import os
-
-import pandas as pd
-import spacy
-from loguru import logger
-
-from data_juicer.utils.model_utils import MODEL_ZOO, prepare_model
-
-
-# Modify from self_instruct, please refer to
-# https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb
-
[docs]def find_root_verb_and_its_dobj(tree_root): - """ - Find the verb and its object closest to the root. - - :param tree_root: the root of lexical tree - :return: valid verb and its object. - """ - # first check if the current node and its children satisfy the condition - if tree_root.pos_ == 'VERB': - for child in tree_root.children: - if child.dep_ == 'dobj' and child.pos_ == 'NOUN': - return tree_root.lemma_ if len( - tree_root.lemma_) else tree_root.text, child.lemma_ if len( - child.lemma_) else child.text - return tree_root.lemma_ if len( - tree_root.lemma_) else tree_root.text, None - # if not, check its children - for child in tree_root.children: - return find_root_verb_and_its_dobj(child) - # if no children satisfy the condition, return None - return None, None
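For reference, a hypothetical standalone call using a locally installed spaCy model (en_core_web_sm is an assumed model name here; the pipeline itself loads models via prepare_model):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Write a short poem about autumn.')
verb, noun = find_root_verb_and_its_dobj(next(doc.sents).root)
print(verb, noun)  # typically ('write', 'poem') for this parse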
- - -# Modify from self_instruct, please refer to -# https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb -
[docs]def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True): - """ - Find the verb and its object closest to the root of lexical tree of input - string. - - :param nlp: the diversity model to analyse the diversity strings - :param s: the string to be analysed - :param first_sent: whether to analyse the first sentence in the - input string only. If it's true, return the analysis result of - the first sentence no matter it's valid or not. If it's false, - return the first valid result over all sentences - :return: valid verb and its object of this string - """ - doc = nlp(s) - for sent in doc.sents: - verb, noun = find_root_verb_and_its_dobj(sent.root) - if first_sent or (verb is not None and noun is not None): - return verb, noun - return None, None
- - -
[docs]def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs): - """ - Given the lexical tree analysis result, return the diversity results. - - :param dataset: lexical tree analysis result - :param top_k_verbs: only keep the top_k_verbs largest verb groups - :param top_k_nouns: only keep the top_k_nouns largest noun groups - for each verb group - :param kwargs: extra args - :return: the diversity results - """ - phrases = pd.DataFrame(dataset).dropna() - logger.info(f'find valid verb-noun structure \ - {phrases.shape[0]} of {dataset.shape[0]}') - top_verbs = phrases.groupby(['verb' - ]).size().nlargest(top_k_verbs).reset_index() - - df = phrases[phrases['verb'].isin(top_verbs['verb'].tolist())] - df = df.groupby(['verb', 'noun']).size().reset_index().rename(columns={ - 0: 'count' - }).sort_values(by=['count'], ascending=False) - - df = df.groupby('verb').apply(lambda x: x.sort_values( - 'count', ascending=False).head(top_k_nouns)).reset_index(drop=True) - return df
- - -
[docs]class DiversityAnalysis: - """Apply diversity analysis for each sample and get an overall analysis - result.""" - - def __init__(self, dataset, output_path, lang_or_model='en'): - """Initialization method :param dataset: the dataset to be analysed - :param output_path: path to store the analysis results :param - lang_or_model: the diversity model or a specific language used to load - the diversity model.""" - - self.dataset = dataset - self.output_path = output_path - if not os.path.exists(self.output_path): - os.makedirs(self.output_path) - self.lang_or_model = lang_or_model - -
[docs] def compute(self, lang_or_model=None, column_name='text'): - """ - Apply lexical tree analysis on each sample. - - :param lang_or_model: the diversity model or a specific language - used to load the diversity model - :param column_name: the name of column to be analysed - :return: the analysis result. - """ - # load diversity model - lang_or_model = lang_or_model if lang_or_model else self.lang_or_model - if isinstance(lang_or_model, str): - diversity_model = MODEL_ZOO.get( - prepare_model(lang_or_model, 'spacy')) - else: - diversity_model = lang_or_model - - assert isinstance(diversity_model, spacy.Language) - - def find_verb_noun(sample): - try: - verb, noun = find_root_verb_and_its_dobj_in_string( - diversity_model, sample[column_name]) - except Exception as e: - print(str(e)) - verb, noun = None, None - return {'verb': verb, 'noun': noun} - - dataset = self.dataset.map(find_verb_noun) - return pd.DataFrame(dataset)
- -
[docs] def analyse(self, - lang_or_model=None, - column_name='text', - postproc_func=get_diversity, - **postproc_kwarg): - """ - Apply diversity analysis on the whole dataset. - - :param lang_or_model: the diversity model or a specific language - used to load the diversity model - :param column_name: the name of column to be analysed - :param postproc_func: function to analyse diversity. In default, - it's function get_diversity - :param postproc_kwarg: arguments of the postproc_func - :return: - """ - # get the lexical tree analysis result - raw_df = self.compute(lang_or_model=lang_or_model, - column_name=column_name) - # get the result of diversity analysis - df = postproc_func(raw_df, **postproc_kwarg) - - # export to result report file - df.to_csv(os.path.join(self.output_path, 'diversity.csv')) - df.to_markdown(os.path.join(self.output_path, 'diversity.md')) - - return df
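For reference, a minimal usage sketch of DiversityAnalysis; `dataset` is assumed to be a HuggingFace dataset with a 'text' column:

diversity = DiversityAnalysis(dataset,
                              output_path='./analysis',
                              lang_or_model='en')
# Runs the verb-noun extraction per sample, then writes diversity.csv/.md.
df = diversity.analyse(column_name='text')
print(df.head())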
\ No newline at end of file diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html deleted file mode 100644 index 3d390b7e8..000000000 --- a/_modules/data_juicer/analysis/overall_analysis.html +++ /dev/null @@ -1,142 +0,0 @@ data_juicer.analysis.overall_analysis — data_juicer 0.1.2 documentation
Source code for data_juicer.analysis.overall_analysis

-import os
-
-import pandas as pd
-
-from data_juicer.utils.constant import Fields
-
[docs]class OverallAnalysis: - """Apply analysis on the overall stats, including mean, std, quantiles, - etc.""" - - def __init__(self, dataset, output_path): - """ - Initialization method. - - :param dataset: the dataset to be analysed - :param output_path: path to store the analysis results. - """ - self.stats = pd.DataFrame(dataset[Fields.stats]) - self.output_path = output_path - if not os.path.exists(self.output_path): - os.makedirs(self.output_path) - - # default percentiles to analyse - self.default_percentiles = [0.25, 0.5, 0.75] - -
[docs] def analyse(self, percentiles=[]): - """ - Apply overall analysis on the whole dataset based on the describe - method of pandas. - - :param percentiles: percentiles to analyse - :return: the overall analysis result. - """ - # merge default and customized percentiles and get overall information - percentiles = list(set(percentiles + self.default_percentiles)) - overall = self.stats.describe(percentiles=percentiles, include='all') - - # export to result report file - overall.to_csv(os.path.join(self.output_path, 'overall.csv')) - overall.to_markdown(os.path.join(self.output_path, 'overall.md')) - - return overall
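For reference, a minimal usage sketch of OverallAnalysis; `dataset` is assumed to carry a Fields.stats column:

oa = OverallAnalysis(dataset, output_path='./analysis')
# Extra percentiles are merged with the defaults [0.25, 0.5, 0.75].
overall = oa.analyse(percentiles=[0.1, 0.9])
print(overall)  # pandas describe()-style table, also saved as overall.csv/.md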
\ No newline at end of file diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html deleted file mode 100644 index 986cd39be..000000000 --- a/_modules/data_juicer/config/config.html +++ /dev/null @@ -1,511 +0,0 @@ data_juicer.config.config — data_juicer 0.1.2 documentation
Source code for data_juicer.config.config

-import os
-import shutil
-import time
-from argparse import ArgumentError
-from typing import Dict, List, Tuple, Union
-
-from jsonargparse import (ActionConfigFile, ArgumentParser, dict_to_namespace,
-                          namespace_to_dict)
-from jsonargparse.typing import NonNegativeInt, PositiveInt
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS
-from data_juicer.utils.logger_utils import setup_logger
-
-
-
[docs]def init_configs(args=None): - """ - initialize the jsonargparse parser and parse configs from one of: - 1. POSIX-style commands line args; - 2. config files in yaml (json and jsonnet supersets); - 3. environment variables - 4. hard-coded defaults - - :param args: list of params, e.g., ['--conifg', 'cfg.yaml'], defaut None. - :return: a global cfg object used by the Executor or Analyser - """ - parser = ArgumentParser(default_env=True, default_config_files=None) - - parser.add_argument( - '--config', - action=ActionConfigFile, - help='Path to a configuration file.', - required=True) - - # basic global paras with extended type hints - # e.g., files can be mode include flags - # "fr": "path to a file that exists and is readable") - # "fc": "path to a file that can be created if it does not exist") - # "dw": "path to a directory that exists and is writeable") - # "dc": "path to a directory that can be created if it does not exist") - # "drw": "path to a directory that exists and is readable and writeable") - parser.add_argument( - '--project_name', - type=str, - default='hello_world', - help='Name of your data process project.') - parser.add_argument( - '--executor_type', - type=str, - default='default', - choices=['default', 'ray'], - help='Type of executor, support "default" or "ray" for now.' - ) - parser.add_argument( - '--dataset_path', - type=str, - help='Path to datasets with optional weights(0.0-1.0), 1.0 as ' - 'default. Accepted format:<w1> dataset1-path <w2> dataset2-path ' - '<w3> dataset3-path ...') - parser.add_argument( - '--export_path', - type=str, - default='./outputs/hello_world.jsonl', - help='Path to export and save the output processed dataset. The ' - 'directory to store the processed dataset will be the work ' - 'directory of this process.') - parser.add_argument( - '--export_shard_size', - type=NonNegativeInt, - default=0, - help='Shard size of exported dataset in Byte. In default, it\'s 0, ' - 'which means export the whole dataset into only one file. If ' - 'it\'s set a positive number, the exported dataset will be split ' - 'into several sub-dataset shards, and the max size of each shard ' - 'won\'t larger than the export_shard_size') - parser.add_argument( - '--export_in_parallel', - type=bool, - default=False, - help='Whether to export the result dataset in parallel to a single ' - 'file, which usually takes less time. It only works when ' - 'export_shard_size is 0, and its default number of processes is ' - 'the same as the argument np. **Notice**: If it\'s True, ' - 'sometimes exporting in parallel might require much more time ' - 'due to the IO blocking, especially for very large datasets. ' - 'When this happens, False is a better choice, although it takes ' - 'more time.') - parser.add_argument( - '--np', - type=PositiveInt, - default=4, - help='Number of processes to process dataset.') - parser.add_argument( - '--text_keys', - type=Union[str, List[str]], - default='text', - help='Key name of field where the sample texts to be processed, e.g., ' - '`text`, `text.instruction`, `text.output`, ... Note: currently, ' - 'we support specify only ONE key for each op, for cases ' - 'requiring multiple keys, users can specify the op multiple ' - 'times. We will only use the first key of `text_keys` when you ' - 'set multiple keys.') - parser.add_argument( - '--suffixes', - type=Union[str, List[str], Tuple[str]], - default=[], - help='Suffixes of files that will be find and loaded. 
If not set, we ' - 'will find all suffix files, and select a suitable formatter ' - 'with the most files as default.') - parser.add_argument( - '--use_cache', - type=bool, - default=True, - help='Whether to use the cache management of huggingface datasets. It ' - 'might take up lots of disk space when using cache') - parser.add_argument( - '--ds_cache_dir', - type=str, - default='~/.cache/huggingface/datasets', - help='Cache dir for HuggingFace datasets. In default it\'s the ' - 'default cache dir "~/.cache/huggingface/datasets". If this ' - 'argument is reset by users, it will override the default cache ' - 'dir.') - parser.add_argument( - '--cache_compress', - type=str, - default=None, - help='The compression method of the cache file, which can be' - 'specified in ["gzip", "zstd", "lz4"]. If this parameter is' - 'None, the cache file will not be compressed.') - parser.add_argument( - '--use_checkpoint', - type=bool, - default=False, - help='Whether to use the checkpoint management to save the latest ' - 'version of dataset to work dir when processing. Rerun the same ' - 'config will reload the checkpoint and skip ops before it. Cache ' - 'will be disabled when it is true . If args of ops before the ' - 'checkpoint are changed, all ops will be rerun from the ' - 'beginning.') - parser.add_argument( - '--temp_dir', - type=str, - default=None, - help='Path to the temp directory to store intermediate caches when ' - 'cache is disabled. In default it\'s None, so the temp dir will ' - 'be specified by system. NOTICE: you should be caution when ' - 'setting this argument because it might cause unexpected program ' - 'behaviors when this path is set to an unsafe directory.') - parser.add_argument( - '--open_tracer', - type=bool, - default=False, - help='Whether to open the tracer to trace samples changed during ' - 'process. It might take more time when opening tracer.') - parser.add_argument( - '--op_list_to_trace', - type=List[str], - default=[], - help='Which ops will be traced by tracer. If it\'s empty, all ops in ' - 'cfg.process will be traced. Only available when open_tracer is ' - 'true.') - parser.add_argument( - '--trace_num', - type=int, - default=10, - help='Number of samples extracted by tracer to show the dataset ' - 'difference before and after a op. Only available when ' - 'open_tracer is true.') - parser.add_argument( - '--op_fusion', - type=bool, - default=False, - help='Whether to fuse operators that share the same intermediate ' - 'variables automatically. Op fusion might reduce the memory ' - 'requirements slightly but speed up the whole process.') - parser.add_argument( - '--process', - type=List[Dict], - help='List of several operators with their arguments, these ops will ' - 'be applied to dataset in order') - parser.add_argument( - '--save_stats_in_one_file', - type=bool, - default=False, - help='Whether to save all stats to only one file. Only used in ' - 'Analysis.') - parser.add_argument( - '--ray_address', - type=str, - default='auto', - help='The address of the Ray cluster.' 
- ) - - # add all parameters of the registered ops class to the parser, - # and these op parameters can be modified through the command line, - ops_sorted_by_types = sort_op_by_types_and_names(OPERATORS.modules.items()) - _collect_config_info_from_class_docs(ops_sorted_by_types, parser) - - try: - cfg = parser.parse_args(args=args) - option_in_commands = [ - ''.join(arg.split('--')[1].split('.')[0]) for arg in parser.args - if '--' in arg and 'config' not in arg - ] - - full_option_in_commands = list( - set([ - ''.join(arg.split('--')[1].split('=')[0]) - for arg in parser.args if '--' in arg and 'config' not in arg - ])) - - if cfg.process is None: - cfg.process = [] - - # check and update every op params in `cfg.process` - # e.g. - # `python demo.py --config demo.yaml - # --language_id_score_filter.lang en` - for i, op_in_process in enumerate(cfg.process): - op_in_process_name = list(op_in_process.keys())[0] - - temp_cfg = cfg - if op_in_process_name not in option_in_commands: - - # update op params to temp cfg if set - if op_in_process[op_in_process_name]: - temp_cfg = parser.merge_config( - dict_to_namespace(op_in_process), cfg) - else: - - # args in the command line override the ones in `cfg.process` - for full_option_in_command in full_option_in_commands: - - key = full_option_in_command.split('.')[1] - if op_in_process[ - op_in_process_name] and key in op_in_process[ - op_in_process_name].keys(): - op_in_process[op_in_process_name].pop(key) - - if op_in_process[op_in_process_name]: - temp_cfg = parser.merge_config( - dict_to_namespace(op_in_process), temp_cfg) - - # update op params of cfg.process - internal_op_para = temp_cfg.get(op_in_process_name) - - cfg.process[i] = { - op_in_process_name: - None if internal_op_para is None else - namespace_to_dict(internal_op_para) - } - - cfg = init_setup_from_cfg(cfg) - - # copy the config file into the work directory - config_backup(cfg) - - # show the final config tables before the process started - display_config(cfg) - - return cfg - except ArgumentError: - logger.error('Config initialization failed')
- - -
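For reference, a hypothetical call to init_configs that mirrors the command-line override pattern described above ('demo.yaml' is a placeholder config path):

cfg = init_configs(args=[
    '--config', 'demo.yaml',
    '--np', '8',
    '--language_id_score_filter.lang', 'en',  # per-op override, as in the comment above
])
print(cfg.project_name, cfg.export_path)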
[docs]def init_setup_from_cfg(cfg): - """ - Do some extra setup tasks after parsing config file or command line. - - 1. create working directory and a log directory - 2. update cache directory - 3. update checkpoint and `temp_dir` of tempfile - - :param cfg: a original cfg - :param cfg: a updated cfg - """ - - export_path = cfg.export_path - cfg.work_dir = os.path.dirname(export_path) - log_dir = os.path.join(cfg.work_dir, 'log') - if not os.path.exists(log_dir): - os.makedirs(log_dir, exist_ok=True) - timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) - cfg.timestamp = timestamp - logfile_name = timestamp + '.txt' - setup_logger(save_dir=log_dir, filename=logfile_name, redirect=cfg.executor_type=='default') - - # whether or not to use cache management - # disabling the cache or using checkpoint explicitly will turn off the - # cache management. - if not cfg.use_cache or cfg.use_checkpoint: - logger.warning('Cache management of datasets is disabled.') - from datasets import disable_caching - disable_caching() - cfg.use_cache = False - - # disabled cache compression when cache is disabled - if cfg.cache_compress: - logger.warning('Disable cache compression due to disabled cache.') - cfg.cache_compress = None - - # when disabling cache, enable the temp_dir argument - logger.warning(f'Set temp directory to store temp files to ' - f'[{cfg.temp_dir}].') - import tempfile - if cfg.temp_dir is not None and not os.path.exists(cfg.temp_dir): - os.makedirs(cfg.temp_dir, exist_ok=True) - tempfile.tempdir = cfg.temp_dir - - # The checkpoint mode is not compatible with op fusion for now. - if cfg.op_fusion: - cfg.use_checkpoint = False - - # reset huggingface datasets cache directory - from datasets import config - config.HF_DATASETS_CACHE = cfg.ds_cache_dir - - # if there is suffix_filter op, turn on the add_suffix flag - cfg.add_suffix = False - for op in cfg.process: - op_name, _ = list(op.items())[0] - if op_name == 'suffix_filter': - cfg.add_suffix = True - break - - # Apply text_key modification during initializing configs - # users can freely specify text_key for different ops using `text_key` - # otherwise, set arg text_key of each op to text_keys - if isinstance(cfg.text_keys, list): - text_key = cfg.text_keys[0] - else: - text_key = cfg.text_keys - for op in cfg.process: - for op_name in op: - args = op[op_name] - if args is None: - args = {'text_key': text_key} - elif args['text_key'] is None: - args['text_key'] = text_key - op[op_name] = args - - return cfg
- - -def _collect_config_info_from_class_docs(configurable_ops, parser): - """ - Add ops and its params to parser for command line. - - :param configurable_ops: a list of ops to be to added, each item is - a pair of op_name and op_class - :param parser: jsonargparse parser need to update - """ - - for op_name, op_class in configurable_ops: - parser.add_class_arguments( - theclass=op_class, - nested_key=op_name, - fail_untyped=False, - instantiate=False, - ) - - -
[docs]def sort_op_by_types_and_names(op_name_classes): - """ - Split ops items by op type and sort them to sub-ops by name, then concat - together. - - :param op_name_classes: a list of op modules - :return: sorted op list , each item is a pair of op_name and - op_class - """ - - mapper_ops = [(name, c) for (name, c) in op_name_classes - if 'mapper' in name] - filter_ops = [(name, c) for (name, c) in op_name_classes - if 'filter' in name] - deduplicator_ops = [(name, c) for (name, c) in op_name_classes - if 'deduplicator' in name] - selector_ops = [(name, c) for (name, c) in op_name_classes - if 'selector' in name] - ops_sorted_by_types = sorted(mapper_ops) + sorted(filter_ops) + sorted( - deduplicator_ops) + sorted(selector_ops) - return ops_sorted_by_types
- -
[docs]def config_backup(cfg): - cfg_path = cfg.config[0].absolute - work_dir = cfg.work_dir - target_path = os.path.join(work_dir, os.path.basename(cfg_path)) - logger.info(f'Back up the input config file [{cfg_path}] into the ' - f'work_dir [{work_dir}]') - shutil.copyfile(cfg_path, target_path)
- -
[docs]def display_config(cfg): - from tabulate import tabulate - import pprint - table_header = ['key', 'values'] - - # remove ops outside the process list for better displaying - shown_cfg = cfg.clone() - for op in OPERATORS.modules.keys(): - _ = shown_cfg.pop(op) - - # construct the table as 2 columns - config_table = [(k, pprint.pformat(v, compact=True)) - for k, v in shown_cfg.items()] - table = tabulate(config_table, headers=table_header, tablefmt='fancy_grid') - - logger.info('Configuration table: ') - print(table)
-
\ No newline at end of file diff --git a/_modules/data_juicer/core/analyser.html b/_modules/data_juicer/core/analyser.html deleted file mode 100644 index 474a28777..000000000 --- a/_modules/data_juicer/core/analyser.html +++ /dev/null @@ -1,230 +0,0 @@ data_juicer.core.analyser — data_juicer 0.1.2 documentation
Source code for data_juicer.core.analyser

-import os
-
-from loguru import logger
-
-from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
-from data_juicer.config import init_configs
-from data_juicer.format import load_formatter
-from data_juicer.ops import Filter, load_ops
-from data_juicer.utils import cache_utils
-from data_juicer.utils.constant import Fields
-
-from .exporter import Exporter
-
-
-
[docs]class Analyser: - """ - This Analyser class is used to analyse a specific dataset. - - It will compute stats for all filter ops in the config file, apply - multiple analysis (e.g. OverallAnalysis, ColumnWiseAnalysis, etc.) - on these stats, and generate the analysis results (stats tables, - distribution figures, etc.) to help users understand the input - dataset better. - """ - - def __init__(self, cfg=None): - """ - Initialization method. - - :param cfg: optional config dict. - """ - self.cfg = init_configs() if cfg is None else cfg - - self.work_dir = self.cfg.work_dir - self.ops = None - - if self.cfg.use_cache: - logger.info(f'Using cache compression method: ' - f'[{self.cfg.cache_compress}]') - cache_utils.CACHE_COMPRESS = self.cfg.cache_compress - - # setup formatter - logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) - - # prepare exporter and check export path suffix - # NOTICE: no need to export dataset texts for analyser - # (export_ds=False). Instead, only need to export stats - # (export_stats=True). - logger.info('Preparing exporter...') - self.exporter = Exporter(self.cfg.export_path, - self.cfg.export_shard_size, - self.cfg.export_in_parallel, - self.cfg.np, - export_ds=False, - export_stats=True) - - # parsed_res - self.overall_result = None - self.overall_single_plot_path = None - self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis') - -
[docs] def run(self, load_data_np=None): - """ - Running the dataset analysis pipeline. - - :param load_data_np: number of workers when loading the dataset. - :return: analysed dataset. - """ - # 1. format data - logger.info('Loading dataset from data formatter...') - if load_data_np is None: - load_data_np = self.cfg.np - dataset = self.formatter.load_dataset(load_data_np) - - # extract processes - logger.info('Preparing process operators...') - self.cfg.process, self.ops = load_ops(self.cfg.process, - self.cfg.op_fusion) - - # 2. stats precompute only for filter ops - logger.info('Computing the stats of dataset...') - stats_collected = False - for op_cfg, op in zip(self.cfg.process, self.ops): - op_name = list(op_cfg.keys())[0] - if isinstance(op, Filter): - if Fields.stats not in dataset.features: - # TODO: - # this is a temp solution, - # only add stats when calling filter op - dataset = dataset.add_column(name=Fields.stats, - column=[{}] * - dataset.num_rows) - dataset = dataset.map(op.compute_stats, - num_proc=self.cfg.np, - desc=op_name + '_compute_stats') - stats_collected = True - if not stats_collected: - logger.warning('No stats collected. Please add some Filter ops to ' - 'the process list in configs.') - return dataset - - # 3. analysis and output result to the export path - # 3.1. Only consider fields in Fields.stats - # 3.2. For string fields, only consider its histogram - # 3.3. For numeric fields, consider its histogram and box - # 3.4. Otherwise, DO NOT analyse - - logger.info('Applying overall analysis on stats...') - overall_analysis = OverallAnalysis(dataset, self.analysis_path) - self.overall_result = overall_analysis.analyse() - - logger.info('Applying column-wise analysis on stats...') - column_wise_analysis = ColumnWiseAnalysis( - dataset, - self.analysis_path, - overall_result=self.overall_result, - save_stats_in_one_file=self.cfg.save_stats_in_one_file) - column_wise_analysis.analyse() - - # 4. data export - logger.info('Exporting dataset to disk...') - self.exporter.export(dataset) - if self.cfg.use_cache and self.cfg.cache_compress: - from data_juicer.utils.compress import compress - compress(dataset) - return dataset
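For reference, a minimal usage sketch of Analyser ('analyse.yaml' is a placeholder config path):

from data_juicer.config import init_configs

cfg = init_configs(args=['--config', 'analyse.yaml'])
analyser = Analyser(cfg)
# Computes stats for all Filter ops, then writes tables and figures
# under <work_dir>/analysis.
analysed_dataset = analyser.run()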
\ No newline at end of file diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html deleted file mode 100644 index d1068e650..000000000 --- a/_modules/data_juicer/core/data.html +++ /dev/null @@ -1,423 +0,0 @@ data_juicer.core.data — data_juicer 0.1.2 documentation
Source code for data_juicer.core.data

-import copy
-import inspect
-from functools import wraps
-from typing import Union
-
-from datasets import Dataset, DatasetDict, is_caching_enabled
-from datasets.formatting.formatting import LazyBatch
-from loguru import logger
-
-from data_juicer.utils import cache_utils
-from data_juicer.utils.compress import (cleanup_compressed_cache_files,
-                                        compress, decompress, CompressionOff)
-from data_juicer.utils.fingerprint_utils import generate_fingerprint
-
-
-
[docs]def wrap_func_with_nested_access(f): - """ - Before conducting actual function `f`, wrap its args and kargs into nested - ones. - - :param f: function to be wrapped. - :return: wrapped function - """ - - def wrap_nested_structure(*args, **kargs): - wrapped_args = [nested_obj_factory(arg) for arg in args] - wrapped_kargs = { - k: nested_obj_factory(arg) - for k, arg in kargs.items() - } - return wrapped_args, nested_obj_factory(wrapped_kargs) - - @wraps(f) - def wrapped_f(*args, **kargs): - args, kargs = wrap_nested_structure(*args, **kargs) - # to ensure the args passing to the final calling of f can be nested, - # in case of deeper-order wrapper funcs de-wrap this nesting behavior - args = [ - wrap_func_with_nested_access(arg) if callable(arg) else arg - for arg in args - ] - kargs = { - k: (wrap_func_with_nested_access(arg) if callable(arg) else arg) - for (k, arg) in kargs.items() - } - return f(*args, **kargs) - - return wrapped_f
- - -
[docs]def nested_obj_factory(obj): - """ - Use nested classes to wrap the input object. - - :param obj: object to be nested. - :return: nested object - """ - if isinstance(obj, Dataset): - return NestedDataset(obj) - elif isinstance(obj, DatasetDict): - return NestedDatasetDict(obj) - elif isinstance(obj, dict): - return NestedQueryDict(obj) - elif isinstance(obj, LazyBatch): - obj.data = NestedQueryDict(obj.data) - return obj - elif isinstance(obj, list): - return [nested_obj_factory(item) for item in obj] - else: - return obj
- - -
[docs]class NestedQueryDict(dict): - """Enhanced dict for better usability.""" - - def __init__(self, *args, **kargs): - if len(args) == 1 and isinstance(args[0], Dataset): - # init from another DatasetDict instance - self.__dict__ = copy.copy(args[0].__dict__) - else: - # init from scratch - super().__init__(*args, **kargs) - - # batched sample, (k & v) are organized by list manner - for k, v in self.items(): - if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict): - self[k] = [NestedQueryDict(item) for item in v] - - def __getitem__(self, key): - return nested_query(self, key)
- - -
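For reference, a small sketch of the nested access these wrappers provide (nested_query is defined later in this module):

sample = nested_obj_factory({'meta': {'date': '2023-08-01', 'src': 'web'}})
print(type(sample).__name__)  # NestedQueryDict
print(sample['meta.date'])    # '2023-08-01', resolved via nested_query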
[docs]class NestedDatasetDict(DatasetDict): - """Enhanced HuggingFace-DatasetDict for better usability and efficiency.""" - - def __init__(self, *args, **kargs): - if len(args) == 1 and isinstance(args[0], Dataset): - # init from another DatasetDict instance - self.__dict__ = copy.copy(args[0].__dict__) - else: - # init from scratch - super().__init__(*args, **kargs) - - def __getitem__(self, key): - return nested_query(self, key) - -
[docs] def map(self, **args): - """Override the map func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" - if 'function' not in args or args['function'] is None: - args['function'] = lambda x: nested_obj_factory(x) - else: - args['function'] = wrap_func_with_nested_access(args['function']) - - return super().map(**args)
- - -
[docs]class NestedDataset(Dataset): - """Enhanced HuggingFace-Dataset for better usability and efficiency.""" - - def __init__(self, *args, **kargs): - if len(args) == 1 and isinstance(args[0], Dataset): - # init from another Dataset instance - self.__dict__ = copy.copy(args[0].__dict__) - else: - # init from scratch - super().__init__(*args, **kargs) - - self.need_to_cleanup_caches = not is_caching_enabled() - - def __getitem__(self, key): - if isinstance(key, str): - # to index columns by query as string name(s) - res = nested_query(self, key) - else: - # to index rows by query as integer index, slices, - # or iter of indices or bools - res = super().__getitem__(key) - return nested_obj_factory(res) - -
[docs] def map(self, *args, **kargs): - """Override the map func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - called_func = args[0] - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - called_func = kargs['function'] - - # For wrapped function, try to get its original unwrapped method - while hasattr(called_func, '__wrapped__'): - called_func = called_func.__wrapped__ - # Does the called function belong to a batched OP? - if inspect.ismethod(called_func) \ - and 'is_batched_op' in dir(called_func.__self__) \ - and callable(getattr(called_func.__self__, 'is_batched_op')) \ - and called_func.__self__.is_batched_op(): - kargs['batched'] = True - kargs['batch_size'] = 1 - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint - - if cache_utils.CACHE_COMPRESS: - decompress(self, - kargs['new_fingerprint'], - kargs['num_proc'] if 'num_proc' in kargs else 1) - - new_ds = NestedDataset(super().map(*args, **kargs)) - - if cache_utils.CACHE_COMPRESS: - compress(self, - new_ds, - kargs['num_proc'] if 'num_proc' in kargs else 1) - - if self.need_to_cleanup_caches: - new_ds.cleanup_cache_files() - - return new_ds
- -
[docs] def filter(self, *args, **kargs): - """Override the filter func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint - - # For filter, it involves a map and a filter operations, so the final - # cache files includes two sets with different fingerprint (before and - # after). So we need to decompress these two sets of compressed cache - # files - if cache_utils.CACHE_COMPRESS: - decompress(self, - [kargs['new_fingerprint'], self._fingerprint], - kargs['num_proc'] if 'num_proc' in kargs else 1) - - # Turn off the compression due to it invokes map actually in the filter - # function. For cache file changes, map: A -> B, filter: A -> A, B. If - # we compress the caches of map, ops after filter cannot find the cache - # files A. So we turn off the inner cache compression for filter. - # Same for cleaning up cache files. - with CompressionOff(): - prev_state = self.need_to_cleanup_caches - self.need_to_cleanup_caches = False - new_ds = NestedDataset(super().filter(*args, **kargs)) - self.need_to_cleanup_caches = prev_state - - if cache_utils.CACHE_COMPRESS: - compress(self, - new_ds, - kargs['num_proc'] if 'num_proc' in kargs else 1) - - if self.need_to_cleanup_caches: - new_ds.cleanup_cache_files() - - return new_ds
- -
[docs] def select(self, *args, **kargs): - """Override the select func, such that selected samples can be accessed - by nested manner.""" - return nested_obj_factory(super().select(*args, **kargs))
- -
[docs] @classmethod - def from_dict(cls, *args, **kargs): - """Override the from_dict func, which is called by most from_xx - constructors, such that the constructed dataset object is - NestedDataset.""" - return NestedDataset(super().from_dict(*args, **kargs))
- -
[docs] def add_column(self, *args, **kargs): - """Override the add column func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().add_column(*args, **kargs))
- -
[docs] def select_columns(self, *args, **kargs): - """Override the select columns func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().select_columns(*args, **kargs))
- -
[docs] def remove_columns(self, *args, **kargs): - """Override the remove columns func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().remove_columns(*args, **kargs))
- -
[docs] def cleanup_cache_files(self): - """Override the cleanup_cache_files func, clear raw and compressed - cache files.""" - cleanup_compressed_cache_files(self) - return super().cleanup_cache_files()
- - -
[docs]def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset, - NestedQueryDict], key): - """ - Find item from a given object, by first checking flatten layer, then - checking nested layers. - - :param root_obj: the object - :param key: the stored item to be queried, e.g., "meta" or - "meta.date" - :return: - """ - subkeys = key.split('.') - - tmp = root_obj - for i in range(len(subkeys)): - try: - key_to_query = '.'.join(subkeys[i:len(subkeys)]) - if isinstance(tmp, - (NestedQueryDict, NestedDataset, NestedDatasetDict)): - # access field using base_class's func to avoid endless loop - res = super(type(tmp), tmp).__getitem__(key_to_query) - elif isinstance(tmp, list): - # NestedDataset may return multiple rows as list - res = [nested_query(item, key_to_query) for item in tmp] - else: - # NestedQueryDict may return single row - res = tmp[key_to_query] - if res is not None: - return res - except Exception as outer_get_error: - exist_in_dict = issubclass(type(tmp), dict) and \ - '.'.join(subkeys[i:i + 1]) in tmp - exist_in_dataset = issubclass(type(tmp), Dataset) and '.'.join( - subkeys[i:i + 1]) in tmp.features - if exist_in_dict or exist_in_dataset: - # dive into next level - tmp = nested_obj_factory(tmp['.'.join(subkeys[i:i + 1])]) - else: - logger.debug( - f'cannot find item given key={key} in dataset=' - f'{root_obj}. For the final caught outer-exception,' - f'type is: {type(outer_get_error)}, ' - f'info is: {outer_get_error}') - return None - - return None
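For reference, a small sketch of nested column access on a NestedDataset built from a plain dict:

ds = NestedDataset.from_dict({
    'text': ['hello', 'world'],
    'meta': [{'date': '2023-08-01'}, {'date': '2023-08-02'}],
})
# Dotted keys dive into the struct column via nested_query.
print(ds['meta.date'])  # ['2023-08-01', '2023-08-02']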
\ No newline at end of file diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html deleted file mode 100644 index 89dce68e1..000000000 --- a/_modules/data_juicer/core/executor.html +++ /dev/null @@ -1,311 +0,0 @@ data_juicer.core.executor — data_juicer 0.1.2 documentation
Source code for data_juicer.core.executor

-import os
-from time import time
-
-from loguru import logger
-
-from data_juicer.config import init_configs
-from data_juicer.format.load import load_formatter
-from data_juicer.ops import (OPERATORS, Deduplicator, Filter, Mapper, Selector,
-                             load_ops)
-from data_juicer.utils import cache_utils
-from data_juicer.utils.ckpt_utils import CheckpointManager
-from data_juicer.utils.constant import Fields
-
-from .exporter import Exporter
-from .tracer import Tracer
-
-
-
[docs]class Executor: - """ - This Executor class is used to process a specific dataset. - - It will load the dataset and unify the format, then apply all the - ops in the config file in order and generate a processed dataset. - """ - - def __init__(self, cfg=None): - """ - Initialization method. - - :param cfg: optional config dict. - """ - self.cfg = init_configs() if cfg is None else cfg - - self.work_dir = self.cfg.work_dir - - self.ops = None - - # only enable it when using cache - if self.cfg.use_cache: - logger.info(f'Using cache compression method: ' - f'[{self.cfg.cache_compress}]') - cache_utils.CACHE_COMPRESS = self.cfg.cache_compress - - # setup formatter - logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) - - # whether to use checkpoint mechanism. If it's true, Executor will - # check if there are existing checkpoints first and try to load the - # checkpoints. If the checkpoints are loaded successfully, ops that - # have been processed will be skipped. - self.process_list = self.cfg.process - if self.cfg.use_checkpoint: - logger.info('Preparing checkpoint manager...') - self.ckpt_dir = os.path.join(self.work_dir, 'ckpt') - self.ckpt_manager = CheckpointManager(self.ckpt_dir, - self.process_list, - self.cfg.np) - if self.ckpt_manager.ckpt_available: - logger.info('Found existed dataset checkpoint.') - self.process_list = self.ckpt_manager.get_left_process_list() - self.cfg.process = self.process_list - - # prepare exporter and check export path suffix - logger.info('Preparing exporter...') - self.exporter = Exporter(self.cfg.export_path, - self.cfg.export_shard_size, - self.cfg.export_in_parallel, - self.cfg.np) - - # setup tracer - self.open_tracer = self.cfg.open_tracer - if self.open_tracer: - logger.info('Preparing tracer...') - self.tracer = Tracer(self.work_dir, show_num=self.cfg.trace_num) - self.op_list_to_trace = self.cfg.op_list_to_trace - if len(self.cfg.op_list_to_trace) == 0: - logger.info('Trace for all ops.') - self.op_list_to_trace = set(OPERATORS.modules.keys()) - -
[docs] def run(self, load_data_np=None): - """ - Running the dataset process pipeline. - - :param load_data_np: number of workers when loading the dataset. - :return: processed dataset. - """ - # 1. format data - if self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available: - logger.info('Loading dataset from checkpoint...') - dataset = self.ckpt_manager.load_ckpt() - else: - logger.info('Loading dataset from data formatter...') - if load_data_np is None: - load_data_np = self.cfg.np - dataset = self.formatter.load_dataset(load_data_np) - - # 2. extract processes - logger.info('Preparing process operators...') - self.process_list, self.ops = load_ops(self.cfg.process, - self.cfg.op_fusion) - - # 3. data process - # - If tracer is open, trace each op after it's processed - # - If checkpoint is open, clean the cache files after each process - logger.info('Processing data...') - start = time() - tstart = start - for op_cfg, op in zip(self.process_list, self.ops): - op_name, op_args = list(op_cfg.items())[0] - prev = dataset # record last dataset - try: - if isinstance(op, Mapper): - tmp = dataset.map(function=op.process, - num_proc=self.cfg.np, - desc=op_name + '_process') - if self.open_tracer and \ - op_name in self.op_list_to_trace: - if op.is_batched_op(): - self.tracer.trace_batch_mapper( - op_name, - dataset, - tmp, - op.text_key) - else: - self.tracer.trace_mapper(op_name, - dataset, - tmp, - op.text_key) - elif isinstance(op, Filter): - if Fields.stats not in dataset.features: - # TODO: - # this is a temp solution, - # only add stats when calling filter op - dataset = dataset.add_column(name=Fields.stats, - column=[{}] * - dataset.num_rows) - if self.cfg.use_checkpoint: - prev = dataset - dataset = dataset.map(op.compute_stats, - num_proc=self.cfg.np, - desc=op_name + '_compute_stats') - if self.cfg.use_checkpoint: - prev = dataset - tmp = dataset.filter(op.process, - num_proc=self.cfg.np, - desc=op_name + '_process') - if self.open_tracer and op_name in self.op_list_to_trace: - self.tracer.trace_filter(op_name, dataset, tmp) - elif isinstance(op, Selector): - tmp = op.process(dataset) - if self.open_tracer and op_name in self.op_list_to_trace: - self.tracer.trace_filter(op_name, dataset, tmp) - elif isinstance(op, Deduplicator): - dataset = dataset.map(op.compute_hash, - num_proc=self.cfg.np, - desc=op_name + '_compute_hash') - if self.cfg.use_checkpoint: - prev = dataset - tmp, dup_pairs = op.process( - dataset, self.tracer.show_num if self.open_tracer - and op_name in self.op_list_to_trace else 0) - if self.open_tracer and op_name in self.op_list_to_trace: - self.tracer.trace_deduplicator(op_name, dup_pairs) - else: - raise NotImplementedError - dataset = tmp - except: # noqa: E722 - logger.error(f'An error occurred during Op [{op_name}].') - import traceback - traceback.print_exc() - if self.cfg.use_checkpoint: - logger.info('Writing checkpoint of dataset processed by ' - 'last op...') - prev.cleanup_cache_files() - self.ckpt_manager.save_ckpt(prev) - exit(1) - - # clean up cache files and record processed ops - if self.cfg.use_checkpoint: - self.ckpt_manager.record(op_name, op_args) - - end = time() - logger.info(f'Op [{op_name}] Done in {"%.3f" % (end - start)}(s). ' - f'Left {len(dataset)} samples.') - start = end - tend = time() - logger.info(f'All Ops are done in {"%.3f" % (tend - tstart)}(s).') - - # 4. 
data export - logger.info('Exporting dataset to disk...') - try: - self.exporter.export(dataset) - except: # noqa: E722 - logger.error('An error occurred during exporting the processed ' - 'dataset.') - import traceback - traceback.print_exc() - if self.cfg.use_checkpoint: - logger.info('Writing checkpoint of dataset processed by ' - 'last op...') - dataset.cleanup_cache_files() - self.ckpt_manager.save_ckpt(dataset) - # compress the last dataset after exporting - if self.cfg.use_cache and self.cfg.cache_compress: - from data_juicer.utils.compress import compress - compress(dataset) - return dataset
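For reference, a minimal usage sketch of Executor ('process.yaml' is a placeholder config path):

from data_juicer.config import init_configs

cfg = init_configs(args=['--config', 'process.yaml'])
executor = Executor(cfg)
# Applies the configured ops in order and exports the result to cfg.export_path.
processed_dataset = executor.run()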
\ No newline at end of file diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html deleted file mode 100644 index f0f335f16..000000000 --- a/_modules/data_juicer/core/exporter.html +++ /dev/null @@ -1,315 +0,0 @@ data_juicer.core.exporter — data_juicer 0.1.2 documentation
Source code for data_juicer.core.exporter

-import os
-from multiprocessing import Pool
-
-from loguru import logger
-
-from data_juicer.utils.constant import Fields
-
-
-
[docs]class Exporter: - """The Exporter class is used to export a dataset to files of specific - format.""" - - KiB = 2**10 # 1024 - MiB = 2**20 # 1024*1024 - GiB = 2**30 # 1024*1024*1024 - TiB = 2**40 # 1024*1024*1024*1024 - - def __init__(self, - export_path, - export_shard_size=0, - export_in_parallel=True, - num_proc=1, - export_ds=True, - export_stats=True): - """ - Initialization method. - - :param export_path: the path to export datasets. - :param export_shard_size: the size of each shard of exported - dataset. In default, it's 0, which means export the dataset - to a single file. - :param num_proc: number of process to export the dataset. - :param export_ds: whether to export the dataset contents. - :param export_stats: whether to export the stats of dataset. - """ - self.export_path = export_path - self.export_shard_size = export_shard_size - self.export_in_parallel = export_in_parallel - self.export_ds = export_ds - self.export_stats = export_stats - self.suffix = self._get_suffix(export_path) - self.num_proc = num_proc - self.max_shard_size_str = '' - - # get the string format of shard size - if self.export_shard_size // Exporter.TiB: - self.max_shard_size_str = '%.2f TiB' % (self.export_shard_size / - Exporter.TiB) - elif self.export_shard_size // Exporter.GiB: - self.max_shard_size_str = '%.2f GiB' % (self.export_shard_size / - Exporter.GiB) - elif self.export_shard_size // Exporter.MiB: - self.max_shard_size_str = '%.2f MiB' % (self.export_shard_size / - Exporter.MiB) - elif self.export_shard_size // Exporter.KiB: - self.max_shard_size_str = '%.2f KiB' % (self.export_shard_size / - Exporter.KiB) - else: - self.max_shard_size_str = '%.2f Bytes' % (self.export_shard_size) - - # we recommend users to set a shard size between MiB and TiB. - if 0 < self.export_shard_size < Exporter.MiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is less than 1MiB. If the result dataset is too ' - f'large, there might be too many shard files to ' - f'generate.') - if self.export_shard_size >= Exporter.TiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is larger than 1TiB. It might generate large ' - f'single shard file and make loading and exporting ' - f'slower.') - - def _get_suffix(self, export_path): - """ - Get the suffix of export path and check if it's supported. - - We only support ["jsonl", "json", "parquet"] for now. - - :param export_path: the path to export datasets. - :return: the suffix of export_path. - """ - suffix = export_path.split('.')[-1].lower() - support_dict = self._router() - if suffix not in support_dict: - raise NotImplementedError(f'Suffix of export path [' - f'{export_path}] is not supported ' - f'for now. Only support ' - f'{list(support_dict.keys())}.') - return suffix - - def _export_impl(self, dataset, export_path, suffix, export_stats=True): - """ - Export a dataset to specific path. - - :param dataset: the dataset to export. - :param export_path: the path to export the dataset. - :param suffix: suffix of export path. - :param export_stats: whether to export stats of dataset. - :return: - """ - if self.export_ds: - # fetch the corresponding export method according to the suffix - export_method = Exporter._router()[suffix] - if self.export_shard_size <= 0: - # export the whole dataset into one single file. 
- logger.info('Export dataset into a single file...') - export_method( - dataset, - export_path, - num_proc=self.num_proc if self.export_in_parallel else 1) - else: - # compute the dataset size and number of shards to split - if dataset._indices is not None: - dataset_nbytes = dataset.data.nbytes * len( - dataset._indices) / len(dataset.data) - else: - dataset_nbytes = dataset.data.nbytes - num_shards = int(dataset_nbytes / self.export_shard_size) + 1 - num_shards = min(num_shards, len(dataset)) - - # split the dataset into multiple shards - logger.info(f'Split the dataset to export into {num_shards} ' - f'shards. Size of each shard <= ' - f'{self.max_shard_size_str}') - shards = [ - dataset.shard(num_shards=num_shards, - index=i, - contiguous=True) for i in range(num_shards) - ] - len_num = len(str(num_shards)) + 1 - num_fmt = f'%0{len_num}d' - - # regard the export path as a directory and set file names for - # each shard - dirname = os.path.dirname(os.path.abspath(self.export_path)) - basename = os.path.basename(self.export_path).split('.')[0] - os.makedirs(dirname, exist_ok=True) - filenames = [ - os.path.join( - dirname, f'{basename}-{num_fmt % index}-of-' - f'{num_fmt % num_shards}' - f'.{self.suffix}') for index in range(num_shards) - ] - - # export dataset into multiple shards using multiprocessing - logger.info(f'Start to exporting to {num_shards} shards.') - pool = Pool(self.num_proc) - for i in range(num_shards): - pool.apply_async(export_method, - args=( - shards[i], - filenames[i], - )) - pool.close() - pool.join() - - if Fields.stats in dataset.features and export_stats: - # export stats of datasets into a single file. - ds_stats = dataset.select_columns(Fields.stats) - stats_file = export_path.replace('.' + suffix, '_stats.jsonl') - Exporter.to_jsonl( - ds_stats, - stats_file, - num_proc=self.num_proc if self.export_in_parallel else 1 - ) - -
[docs] def export(self, dataset): - """ - Export method for a dataset. - - :param dataset: the dataset to export. - :return: - """ - self._export_impl(dataset, self.export_path, self.suffix, - self.export_stats)
- -
[docs] @staticmethod - def to_jsonl(dataset, export_path, num_proc=1, **kwargs): - """ - Export method for json/jsonl target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param num_proc: the number of processes used to export the dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
- -
[docs] @staticmethod - def to_parquet(dataset, export_path, **kwargs): - """ - Export method for parquet target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_parquet(export_path)
- - # suffix to export method - @staticmethod - def _router(): - """ - A router from different suffixes to corresponding export methods. - - :return: A dict router. - """ - return { - 'jsonl': Exporter.to_jsonl, - 'json': Exporter.to_jsonl, - 'parquet': Exporter.to_parquet, - }
-
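A minimal usage sketch for the Exporter class shown above, not taken from the repository itself: the import path is inferred from this page's module name and the toy dataset is illustrative. Since sharded export uses a multiprocessing Pool, run it under a standard `if __name__ == '__main__':` guard on platforms that spawn subprocesses.

from datasets import Dataset

from data_juicer.core.exporter import Exporter  # assumed import path

# toy in-memory dataset with the default 'text' field
ds = Dataset.from_dict({'text': ['hello world'] * 2000})

# ask for ~64 MiB shards; export_shard_size=0 would write one single file instead
exporter = Exporter(export_path='./outputs/demo.jsonl',
                    export_shard_size=64 * Exporter.MiB,
                    num_proc=2)

# num_shards is roughly dataset_nbytes / export_shard_size + 1, capped by the
# number of samples, so a tiny dataset like this still yields a single shard
exporter.export(ds)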
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html deleted file mode 100644 index 9a50e56fe..000000000 --- a/_modules/data_juicer/core/tracer.html +++ /dev/null @@ -1,325 +0,0 @@ - - - - - - data_juicer.core.tracer — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.tracer

-import os
-
-import pandas as pd
-from datasets import Dataset
-from loguru import logger
-
-
-
[docs]class Tracer: - """ - The tracer to trace the sample changes before and after an operator - process. - - The comparison results will be stored in the work directory. - """ - - def __init__(self, work_dir, show_num=10): - """ - Initialization method. - - :param work_dir: the work directory to store the comparison - results - :param show_num: the maximum number of samples to show in the - comparison result files. - """ - self.work_dir = os.path.join(work_dir, 'trace') - if not os.path.exists(self.work_dir): - os.makedirs(self.work_dir) - self.show_num = show_num - -
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a Mapper. - - This will mainly show the different sample pairs due to the - modification by the Mapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert len(previous_ds) == len(processed_ds) - dif_dict = [] - num = 0 - - # Find different samples orderly between previous and processed - # datasets until the total number of found sample pairs is enough. - for i in range(len(previous_ds)): - previous_sample = previous_ds[i][text_key] - processed_sample = processed_ds[i][text_key] - if previous_sample != processed_sample: - dif_dict.append({ - 'original text': previous_sample, - 'processed_text': processed_sample, - }) - num += 1 - if num >= self.show_num: - break - - if len(dif_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(dif_dict) < self.show_num: - logger.warning(f'There are {len(dif_dict)} different samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(dif_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_batch_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a BatchMapper. - - This will mainly show the new samples augmented by the BatchMapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert previous_ds[0][text_key] == processed_ds[0][text_key] - aug_dict = [] - - # Get the first samples - for i in range(len(processed_ds)): - processed_sample = processed_ds[i] - aug_dict.append(processed_sample) - if i + 1 >= self.show_num: - break - - if len(aug_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are ' - f'empty. Thus no comparison results would be ' - f'generated.') - return - elif len(aug_dict) < self.show_num: - logger.warning(f'There are only {len(aug_dict)} samples -- less ' - f'than expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(aug_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_filter(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset): - """ - Compare datasets before and after a Filter. - - This will mainly show the filtered samples by the Filter - - :param op_name: the op name of filter - :param previous_ds: dataset before the filter process - :param processed_ds: dataset processed by the filter - :return: - """ - if len(previous_ds) == len(processed_ds): - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - - # get the number of filtered samples. - total_dif_num = len(previous_ds) - len(processed_ds) - # index of the current sample in the previous dataset - i = 0 - filter_dict = [] - # number of found filtered samples. It's the offset bewteen two - # datasets as well. - num = 0 - while i < len(previous_ds): - if i - num >= len(processed_ds) or \ - previous_ds[i] != processed_ds[i - num]: - # 1. If all samples in processed dataset are checked but there - # still some samples left in the previous dataset, all of these - # left samples are filtered. - # 2. If the corresponding samples in previous and processed - # datasets are different, samples in the previous dataset are - # filtered. - num += 1 - filter_dict.append(previous_ds[i]) - if num >= self.show_num or num >= total_dif_num: - # If the total number of found filtered samples is enough or we - # have found all filtered samples, just stop. - break - i += 1 - if len(filter_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(filter_dict) < self.show_num: - logger.warning(f'There are {len(filter_dict)} filtered samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'filter-{op_name}.jsonl' - filter_df = pd.DataFrame(filter_dict) - filter_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_deduplicator(self, op_name: str, dup_pairs: list): - """ - Compare datasets before and after a Deduplicator. - - This will mainly show the near-duplicate sample pairs extracted - by the Deduplicator. Different from the other two trace methods, - the trace process for deduplicator is embedded into the process - method of deduplicator, but the other two trace methods are - independent of the process method of mapper and filter operators - - :param op_name: the op name of deduplicator - :param dup_pairs: duplicate sample pairs obtained from - deduplicator - :return: - """ - if dup_pairs is None: - logger.warning(f'Op [{op_name}] does not generate dup_pairs ' - f'correctly, thus no comparison results can be ' - f'obtained from this op.') - return - if len(dup_pairs) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(dup_pairs) < self.show_num: - logger.warning(f'There are {len(dup_pairs)} filtered samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # reorganize the duplicate pairs - dup_dict = [] - for key in dup_pairs: - dup_dict.append({ - 'dup1': dup_pairs[key][0], - 'dup2': dup_pairs[key][1], - }) - - # export the tracer result. - res_name = f'duplicate-{op_name}.jsonl' - dup_df = pd.DataFrame(dup_dict) - dup_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
-
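A small sketch of driving the Tracer above by hand; in normal use it is created and invoked by the processing pipeline when tracing is enabled. The import path and the op name are assumptions, and the two toy datasets stand in for a dataset before and after a mapper.

from datasets import Dataset

from data_juicer.core.tracer import Tracer  # assumed import path

before = Dataset.from_dict({'text': ['Hello   World', 'unchanged sample']})
after = Dataset.from_dict({'text': ['Hello World', 'unchanged sample']})

tracer = Tracer(work_dir='./outputs', show_num=10)
# writes ./outputs/trace/mapper-whitespace_normalization_mapper.jsonl holding
# only the sample pairs that differ (here, just the first pair)
tracer.trace_mapper('whitespace_normalization_mapper', before, after, 'text')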
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html deleted file mode 100644 index 47d27d095..000000000 --- a/_modules/data_juicer/format/csv_formatter.html +++ /dev/null @@ -1,127 +0,0 @@ - - - - - - data_juicer.format.csv_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.csv_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class CsvFormatter(LocalFormatter): - """ - The class is used to load and format csv-type files. - - Default suffixes are `['.csv']` - """ - SUFFIXES = ['.csv'] - - def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='csv', - **kwargs, - )
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html deleted file mode 100644 index ee3fe449d..000000000 --- a/_modules/data_juicer/format/formatter.html +++ /dev/null @@ -1,366 +0,0 @@ - - - - - - data_juicer.format.formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.formatter

-import os
-from typing import List, Tuple, Union
-
-from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
-from loguru import logger
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import (find_files_with_suffix,
-                                          is_absolute_path)
-from data_juicer.utils.registry import Registry
-
-FORMATTERS = Registry('Formatters')
-
-
-
[docs]class BaseFormatter: - """Base class to load dataset.""" - -
[docs] def load_dataset(self, *args) -> Dataset: - raise NotImplementedError
- - -
[docs]class LocalFormatter(BaseFormatter): - """The class is used to load a dataset from local files or local - directory.""" - - def __init__( - self, - dataset_path: str, - type: str, - suffixes: Union[str, List[str], Tuple[str]] = None, - text_keys: List[str] = None, - add_suffix=False, - **kwargs, - ): - """ - Initialization method. - - :param dataset_path: path to a dataset file or a dataset - directory - :param type: a packaged dataset module type (json, csv, etc.) - :param suffixes: files with specified suffixes to be processed - :param text_keys: key names of field that stores sample - text. - :param add_suffix: whether to add the file suffix to dataset - meta info - :param kwargs: extra args - """ - self.type = type - self.kwargs = kwargs - self.text_keys = text_keys - self.data_files = find_files_with_suffix(dataset_path, suffixes) - self.add_suffix = add_suffix - -
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: - """ - Load a dataset from dataset file or dataset directory, and unify its - format. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: global cfg used in consequent processes, - :return: formatted dataset - """ - datasets = load_dataset(self.type, - data_files={ - key.strip('.'): self.data_files[key] - for key in self.data_files - }, - num_proc=num_proc, - **self.kwargs) - if self.add_suffix: - logger.info('Add suffix info into dataset...') - datasets = add_suffixes(datasets) - else: - from data_juicer.core.data import NestedDataset - datasets = NestedDataset( - concatenate_datasets([ds for _, ds in datasets.items()])) - ds = unify_format(datasets, - text_keys=self.text_keys, - num_proc=num_proc) - return ds
- - -
[docs]class RemoteFormatter(BaseFormatter): - """The class is used to load a dataset from repository of huggingface - hub.""" - - def __init__(self, - dataset_path: str, - text_keys: List[str] = None, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param text_keys: key names of field that stores sample - text. - :param kwargs: extra args - """ - self.path = dataset_path - self.text_keys = text_keys - self.kwargs = kwargs - -
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: - """ - Load a dataset from HuggingFace, and unify its format. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: the global cfg used in consequent processes, - :return: formatted dataset - """ - ds = load_dataset(self.path, - split='train', - num_proc=num_proc, - **self.kwargs) - ds = unify_format(ds, text_keys=self.text_keys, num_proc=num_proc) - return ds
- - -
[docs]def add_suffixes(datasets: DatasetDict) -> Dataset: - """ - Add suffix field to datasets. - - :param datasets: a DatasetDict object - :return: datasets with suffix features. - """ - logger.info('Add suffix column for dataset') - for key, ds in datasets.items(): - if Fields.suffix not in ds.features: - datasets[key] = ds.add_column(name=Fields.suffix, - column=['.' + key] * ds.num_rows) - datasets = concatenate_datasets([ds for _, ds in datasets.items()]) - from data_juicer.core.data import NestedDataset - return NestedDataset(datasets)
- - -
[docs]def unify_format( - dataset: Dataset, - text_keys: Union[List[str], str] = 'text', - num_proc: int = 1, -) -> Dataset: - """ - Get an unified internal format, conduct the following modifications. - - 1. check keys of dataset - - 2. filter out those samples with empty or None text - - :param dataset: input dataset - :param text_keys: original text key(s) of dataset. - :param num_proc: number of processes for mapping - :param global_cfg: the global cfg used in consequent processes, - since cfg.text_key may be modified after unifying - - :return: unified_format_dataset - """ - from data_juicer.core.data import NestedDataset - if isinstance(dataset, DatasetDict): - datasets = list(dataset.values()) - assert len(datasets) == 1, 'Please make sure the passed datasets ' \ - 'contains only 1 dataset' - dataset = datasets[0] - assert isinstance(dataset, Dataset) or \ - isinstance(dataset, NestedDataset), \ - 'Currently we only support processing data' \ - 'with huggingface-Dataset format' - - if text_keys is None: - text_keys = [] - - if isinstance(text_keys, str): - text_keys = [text_keys] - - logger.info('Unifying the input dataset formats...') - - dataset = NestedDataset(dataset) - - # 1. check text related keys - for key in text_keys: - if key not in dataset.features: - err_msg = f'There is no key [{key}] in dataset. You might set ' \ - f'wrong text_key in the config file for your dataset. ' \ - f'Please check and retry!' - logger.error(err_msg) - raise ValueError(err_msg) - - # 2. filter out those samples with empty or None text - # TODO: optimize the filtering operation for better efficiency - logger.info(f'There are {len(dataset)} sample(s) in the original dataset.') - - def non_empty_text(sample, target_keys): - for target_key in target_keys: - # TODO: case for CFT, in which the len(sample[target_key]) == 0 - if sample[target_key] is None: - # we filter out the samples contains at least None column - # since the op can not handle it now - return False - return True - - dataset = dataset.filter(non_empty_text, - num_proc=num_proc, - fn_kwargs={'target_keys': text_keys}) - logger.info(f'{len(dataset)} samples left after filtering empty text.') - - # 3. add Fields.stats field - # TODO: - # this is a temp solution, - # it will occur errors when only call mapper ops - # dataset = dataset.add_column( \ - # name=Fields.stats, column=[{}] * dataset.num_rows) - - return dataset
- - -
[docs]def load_formatter(dataset_path, - text_keys=None, - suffixes=None, - add_suffix=False, - **kwargs) -> BaseFormatter: - """ - Load the appropriate formatter for different types of data formats. - - :param dataset_path: Path to dataset file or dataset directory - :param text_keys: key names of field that stores sample text. - Default: None - :param suffixes: the suffix of files that will be read. Default: - None - :return: a dataset formatter. - """ - - if suffixes is None: - suffixes = [] - ext_num = {} - if os.path.isdir(dataset_path) or os.path.isfile(dataset_path): - file_dict = find_files_with_suffix(dataset_path, suffixes) - if not file_dict: - raise IOError( - 'Unable to find files matching the suffix from {}'.format( - dataset_path)) - for ext in file_dict: - ext_num[ext] = len(file_dict[ext]) - - # local dataset - if ext_num: - formatter_num = {} - for name, formatter in FORMATTERS.modules.items(): - formatter_num[name] = 0 - for ext in ext_num: - if ext in formatter.SUFFIXES: - formatter_num[name] += ext_num[ext] - formatter = max(formatter_num, key=lambda x: formatter_num[x]) - target_suffixes = set(ext_num.keys()).intersection( - set(FORMATTERS.modules[formatter].SUFFIXES)) - return FORMATTERS.modules[formatter](dataset_path, - text_keys=text_keys, - suffixes=target_suffixes, - add_suffix=add_suffix, - **kwargs) - - # try huggingface dataset hub - elif not is_absolute_path(dataset_path) and dataset_path.count('/') <= 1: - return RemoteFormatter(dataset_path, text_keys=text_keys, **kwargs) - - # no data - else: - raise NotImplementedError
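A hedged sketch of how the suffix-majority routing in load_formatter above could be used; the directory layout is an assumption.

from data_juicer.format.formatter import load_formatter  # assumed import path

# suppose ./demo_data holds mostly *.jsonl files plus a few *.csv files;
# the formatter whose SUFFIXES match the most files wins, so JsonFormatter is
# chosen and only the files with its suffixes are loaded
formatter = load_formatter('./demo_data', text_keys=['text'], add_suffix=True)
ds = formatter.load_dataset(num_proc=4)
print(ds)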
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html deleted file mode 100644 index 5452c4dc5..000000000 --- a/_modules/data_juicer/format/json_formatter.html +++ /dev/null @@ -1,127 +0,0 @@ - - - - - - data_juicer.format.json_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.json_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class JsonFormatter(LocalFormatter): - """ - The class is used to load and format json-type files. - - Default suffixes are `['.json', '.jsonl', '.jsonl.zst']` - """ - SUFFIXES = ['.json', '.jsonl', '.jsonl.zst'] - - def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='json', - **kwargs, - )
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/load.html b/_modules/data_juicer/format/load.html deleted file mode 100644 index b00d988cf..000000000 --- a/_modules/data_juicer/format/load.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - data_juicer.format.load — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.load

-from .formatter import BaseFormatter
-from .mixture_formatter import MixtureFormatter
-
-
-
[docs]def load_formatter(dataset_path, - text_keys=None, - suffixes=[], - add_suffix=False, - **kwargs) -> BaseFormatter: - """ - Load a mixture formatter for multiple different data formats, each with an - optional weight (default 1.0). - - :param dataset_path: path to a dataset file or a dataset directory - :param text_keys: key names of field that stores sample text. - Default: None - :param suffixes: files with specified suffixes to be processed. - :param add_suffix: whether to add the file suffix to dataset meta - info - :return: a dataset formatter. - """ - formatter = MixtureFormatter(dataset_path=dataset_path, - text_keys=text_keys, - suffixes=suffixes, - add_suffix=add_suffix, - **kwargs) - return formatter
-
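A short illustration of the weighted dataset-path syntax this wrapper passes on to MixtureFormatter (shown on the next page); the file names are placeholders and the import path is assumed from this page.

from data_juicer.format.load import load_formatter  # assumed import path

# 0.3 -> sample roughly 30% of web.jsonl; the other two sources get the
# default weight 1.0 because no number precedes them
formatter = load_formatter('0.3 ./web.jsonl ./books_dir ./wiki.parquet',
                           text_keys=['text'])
mixed = formatter.load_dataset(num_proc=2)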
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html deleted file mode 100644 index 477423a08..000000000 --- a/_modules/data_juicer/format/mixture_formatter.html +++ /dev/null @@ -1,203 +0,0 @@ - - - - - - data_juicer.format.mixture_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.mixture_formatter

-from typing import List, Tuple, Union
-
-import numpy as np
-from datasets import Dataset, concatenate_datasets
-from loguru import logger
-
-from .formatter import BaseFormatter, load_formatter
-
-
-
[docs]class MixtureFormatter(BaseFormatter): - """The class mixes multiple datasets by randomly selecting samples from - every dataset and merging them, and then exports the merged datasset as a - new mixed dataset.""" - - def __init__(self, - dataset_path: str, - suffixes: Union[str, List[str], Tuple[str]] = None, - text_keys=None, - add_suffix=False, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset dir or a list - of them, optional weights, default 1.0 e.g. `<w1> ds.jsonl - <w2> ds_dir <w3> ds_file.json` - :param suffixes: files with specified suffixes to be processed - :param text_keys: key names of field that stores sample text. - :param add_suffix: whether to add the file suffix to dataset - meta info - :param kwargs: extra args - """ - data_prefixes, weights = self._get_weight(data_prefix=dataset_path) - self.weights = weights - self.formatters = [ - load_formatter(dataset_path=data_prefix, - suffixes=suffixes, - text_keys=text_keys, - add_suffix=add_suffix, - **kwargs) for data_prefix in data_prefixes - ] - - def _get_weight(self, data_prefix): - """ - Split every dataset path and its weight. - - :param data_prefix: a dataset file or a dataset dir or a list of - them, e.g. `<w1> ds1.jsonl <w2> ds2_dir <w3> ds3_file.json` - :return: list of dataset path and list of weights - """ - data_prefix = data_prefix.split() - weights = [] - prefixes = [] - - for i in range(len(data_prefix)): - try: - value = float(data_prefix[i]) - weights.append(value) - except: # noqa: E722 - value = data_prefix[i].strip() - - # if not set weight, use 1.0 as default - if i == 0 or len(weights) == len(prefixes): - weights.append(1.0) - prefixes.append(value) - return prefixes, weights - - def _random_sample(self, dataset, weight=1.0, seed=None): - """ - Randomly sample a subset from a dataset with weight. - :param dataset: a HuggingFace dataset - :param weight: sample ratio of dataset - :param seed: random sample seed, if None, 42 as default - :return: a subset of dataset - """ - if seed is None: - seed = 42 - num_samples = min(int(np.ceil(dataset.num_rows * weight)), - dataset.num_rows) - if num_samples == dataset.num_rows: - return dataset - return dataset.shuffle(seed=seed).select(range(num_samples)) - -
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: - """ - Load a mixed dataset. - - :param num_proc: number of processes when loading the dataset - :return: mixed dataset - """ - dataset_list = [] - for weight, formatter in zip(self.weights, self.formatters): - dataset = formatter.load_dataset(num_proc) - sampled = self._random_sample(dataset, weight) - logger.info(f'sampled {len(sampled)} from ' - f'{len(dataset)} with weight {weight}') - dataset_list.append(sampled) - - from data_juicer.core.data import NestedDataset - mixed_dataset = NestedDataset(concatenate_datasets(dataset_list)) - logger.info(f'There are {len(mixed_dataset)} in final dataset') - return mixed_dataset
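A quick standalone check of how a weight turns into a sample count in _random_sample above; the helper below simply mirrors the formula in that method, so no data_juicer import is needed.

import numpy as np

def num_sampled(num_rows, weight):
    # same formula as MixtureFormatter._random_sample above
    return min(int(np.ceil(num_rows * weight)), num_rows)

print(num_sampled(10_000, 0.25))  # 2500
print(num_sampled(10_000, 1.0))   # 10000 (the dataset is returned unchanged)
print(num_sampled(10_000, 3.0))   # 10000 (weights > 1 are capped at the full set)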
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html deleted file mode 100644 index 1b542d4e0..000000000 --- a/_modules/data_juicer/format/parquet_formatter.html +++ /dev/null @@ -1,127 +0,0 @@ - - - - - - data_juicer.format.parquet_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.parquet_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class ParquetFormatter(LocalFormatter): - """ - The class is used to load and format parquet-type files. - - Default suffixes are `['.parquet']` - """ - SUFFIXES = ['.parquet'] - - def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='parquet', - **kwargs, - )
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html deleted file mode 100644 index 630945540..000000000 --- a/_modules/data_juicer/format/text_formatter.html +++ /dev/null @@ -1,258 +0,0 @@ - - - - - - data_juicer.format.text_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.text_formatter

-import os
-from multiprocessing import Pool
-
-import pdfplumber
-from datasets import Dataset, concatenate_datasets, load_dataset
-from docx import Document
-from loguru import logger
-
-from data_juicer.utils.cache_utils import DATA_JUICER_CACHE_HOME
-from data_juicer.utils.file_utils import find_files_with_suffix
-
-from .formatter import FORMATTERS, LocalFormatter, add_suffixes, unify_format
-
-
-
[docs]def extract_txt_from_docx(fn, tgt_path): - """ - Extract text from a docx file and save to target path. - - :param fn: path to input docx file - :param tgt_path: path to save text file. - """ - doc = Document(fn) - text = [para.text for para in doc.paragraphs if para.text.strip()] - base_fn = os.path.basename(fn).lower().replace('.docx', '.txt') - with open(os.path.join(tgt_path, base_fn), 'w') as f: - f.write('\n'.join(text))
- - -
[docs]def extract_txt_from_pdf(fn, tgt_path): - """ - Extract text from a pdf file and save to target path. - - :param fn: path to input pdf file - :param tgt_path: path to save text file. - """ - with pdfplumber.open(fn) as pdf: - text = [] - for page in pdf.pages: - # remove tables from each page extracted by pdfplumber - tables = page.find_tables() - for table in tables: - page = page.outside_bbox(table.bbox) - # remove page number from the end of each page - page_text = page.extract_text() - page_num = str(page.page_number) - if page_text.rstrip().endswith(page_num): - page_text = page_text.rstrip()[:-len(page_num)] - if page_text.strip(): - text.append(page_text) - base_fn = os.path.basename(fn).lower().replace('.pdf', '.txt') - with open(os.path.join(tgt_path, base_fn), 'w') as f: - f.write('\n'.join(text))
- - -
[docs]@FORMATTERS.register_module() -class TextFormatter(LocalFormatter): - """ - The class is used to load and format text-type files. - - e.g. `['.txt', '.pdf', '.cpp', '.docx']` - """ - - SUFFIXES = [ - '.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', - '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', - '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', - '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', - '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', - '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', - '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', - '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', - '.m', '.smali' - ] - - def __init__(self, - dataset_path, - suffixes=None, - add_suffix=False, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param add_suffix: Whether to add file suffix to datase meta - info - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='text', - add_suffix=add_suffix, - **kwargs, - ) - self.dataset_path = dataset_path - self.add_suffix = add_suffix - -
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset: - """ - Load a dataset from local text-type files. - - :param num_proc: number of processes when loading the dataset - :return: unified_format_dataset. - """ - # extract text to cache directory - extracted_dataset_path = os.path.join( - DATA_JUICER_CACHE_HOME, - os.path.basename(os.path.abspath(self.dataset_path))) - - for file_type in self.data_files: - - # extract text from docx or pdf files, and save as txt type - if file_type == '.docx' or file_type == '.pdf': - extracted_filetype_path = os.path.join(extracted_dataset_path, - file_type.strip('.')) - if not os.path.exists(extracted_filetype_path): - os.makedirs(extracted_filetype_path) - logger.info('Extracting text from {} files...'.format( - file_type.strip('.'))) - - extract_func = extract_txt_from_docx \ - if file_type == '.docx' else extract_txt_from_pdf - pool = Pool(num_proc) - for data_file in self.data_files[file_type]: - pool.apply_async(func=extract_func, - args=( - data_file, - extracted_filetype_path, - )) - pool.close() - pool.join() - logger.info(f'Extracted text files are stored in directory ' - f'{extracted_filetype_path}') - - # look for extracted txt files - self.data_files[file_type] = find_files_with_suffix( - extracted_filetype_path, '.txt')['.txt'] - - # load text dataset, one text file as one sample - datasets = load_dataset('text', - data_files={ - key.strip('.'): self.data_files[key] - for key in self.data_files - }, - sample_by='document', - num_proc=num_proc, - **self.kwargs) - # whether to add file suffix to datase meta info - if self.add_suffix: - logger.info('Add suffix info into dataset...') - datasets = add_suffixes(datasets) - else: - datasets = concatenate_datasets([ds for _, ds in datasets.items()]) - return unify_format(datasets, - text_keys=self.text_keys, - num_proc=num_proc)
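A hedged usage sketch for TextFormatter above: pointed at a directory, each matched file becomes one sample (sample_by='document'). The path and suffixes are placeholders and the import path is assumed from this page.

from data_juicer.format.text_formatter import TextFormatter  # assumed import path

# .pdf/.docx files would first be converted to .txt in the data_juicer cache
# directory; plain .txt/.md files are loaded directly, one file per sample
formatter = TextFormatter('./raw_corpus',
                          suffixes=['.txt', '.md'],
                          add_suffix=True)
ds = formatter.load_dataset(num_proc=4)
print(len(ds), 'documents loaded')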
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html deleted file mode 100644 index 4b0169428..000000000 --- a/_modules/data_juicer/format/tsv_formatter.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - data_juicer.format.tsv_formatter — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.format.tsv_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class TsvFormatter(LocalFormatter): - """ - The class is used to load and format tsv-type files. - - Default suffixes are `['.tsv']` - """ - SUFFIXES = ['.tsv'] - - def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args, e.g. `delimiter = ','` - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='csv', - delimiter='\t', - **kwargs, - )
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html deleted file mode 100644 index 3df4706c7..000000000 --- a/_modules/data_juicer/ops/base_op.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - data_juicer.ops.base_op — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.base_op

-from data_juicer.utils.registry import Registry
-
-OPERATORS = Registry('Operators')
-
-
-
[docs]class Mapper: - - def __init__(self, text_key: str = None): - """ - Base class that conducts text editing. - - :param text_key: the key name of field that stores sample texts - to be processed. - """ - if text_key is None: - text_key = 'text' - self.text_key = text_key - from data_juicer.core.data import wrap_func_with_nested_access - self.process = wrap_func_with_nested_access(self.process) - - # In default, it's a normal OP instead of batched OP - self._batched_op = False - -
[docs] def process(self, sample): - """ - For sample level, sample --> sample - - :param sample: sample to process - :return: processed sample - """ - raise NotImplementedError
- -
[docs] def is_batched_op(self): - return self._batched_op
- - -
[docs]class Filter: - - def __init__(self, text_key: str = None): - """ - Base class that removes specific info. - - :param text_key: the key name of field that stores sample texts - to be processed - """ - if text_key is None: - text_key = 'text' - self.text_key = text_key - from data_juicer.core.data import wrap_func_with_nested_access - self.process = wrap_func_with_nested_access(self.process) - self.compute_stats = wrap_func_with_nested_access(self.compute_stats) - -
[docs] def compute_stats(self, sample, context=False): - """ - Compute stats for the sample which is used as a metric to decide - whether to filter this sample. - - :param sample: input sample. - :param context: whether to store context information of intermediate - vars in the sample temporarily. - :return: sample with computed stats - """ - raise NotImplementedError
- -
[docs] def process(self, sample): - """ - For sample level, sample --> Boolean. - - :param sample: sample to decide whether to filter - :return: true for keeping and false for filtering - """ - raise NotImplementedError
- - -
[docs]class Deduplicator: - - def __init__(self, text_key: str = None): - """ - Base class that conducts deduplication. - - :param text_key: the key name of field that stores sample texts - to be processed - """ - if text_key is None: - text_key = 'text' - self.text_key = text_key - from data_juicer.core.data import wrap_func_with_nested_access - self.process = wrap_func_with_nested_access(self.process) - self.compute_hash = wrap_func_with_nested_access(self.compute_hash) - -
[docs] def compute_hash(self, sample): - """ - Compute hash values for the sample. - - :param sample: input sample - :return: sample with computed hash value. - """ - raise NotImplementedError
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - raise NotImplementedError
- - -
[docs]class Selector: - - def __init__(self, text_key: str = None): - """ - Base class that conducts selection in dataset-level. - - :param text_key: the key name of field that stores sample texts - to be processed - """ - if text_key is None: - text_key = 'text' - self.text_key = text_key - from data_juicer.core.data import wrap_func_with_nested_access - self.process = wrap_func_with_nested_access(self.process) - -
[docs] def process(self, dataset): - """ - Dataset --> dataset. - - :param dataset: input dataset - :return: selected dataset. - """ - raise NotImplementedError
-
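A minimal sketch of building a new op on top of the base classes above; the op name and behaviour are invented for illustration and are not part of data_juicer, and the import path is assumed from this page.

from data_juicer.ops.base_op import OPERATORS, Mapper  # assumed import path

@OPERATORS.register_module('lowercase_mapper')  # hypothetical op name
class LowercaseMapper(Mapper):
    """Toy sample-level mapper that lowercases the text field."""

    def process(self, sample):
        # sample --> sample, as required by Mapper.process
        sample[self.text_key] = sample[self.text_key].lower()
        return sample

# once registered, the class is reachable by name through the registry,
# which is how a config-driven pipeline would look it up
op_cls = OPERATORS.modules['lowercase_mapper']
op = op_cls()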
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html deleted file mode 100644 index 766ed18ea..000000000 --- a/_modules/data_juicer/ops/common/helper_func.html +++ /dev/null @@ -1,298 +0,0 @@ - - - - - - data_juicer.ops.common.helper_func — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.ops.common.helper_func

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-from typing import Dict
-
-import regex as re
-
-
-
[docs]class UnionFind: - - def __init__(self): - """Initialization method.""" - self.parent: Dict[int, int] = {} - -
[docs] def find(self, x): - if x not in self.parent: - self.parent[x] = x - if self.parent[x] != x: - self.parent[x] = self.find(self.parent[x]) - return self.parent[x]
- -
[docs] def union(self, x, y): - px = self.find(x) - py = self.find(y) - self.parent[px] = self.parent[py] = min(px, py)
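A tiny illustration of the UnionFind above, assuming it is imported from this module: after the two unions, all three ids share the smallest id as their root.

from data_juicer.ops.common.helper_func import UnionFind  # assumed import path

uf = UnionFind()
uf.union(1, 2)
uf.union(2, 3)
print(uf.find(3))  # 1 -- ids 1, 2 and 3 now belong to one cluster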
- - -
[docs]def strip(document, strip_characters): - """ - Way faster than document.strip(strip_characters) since strip_characters is - now a set instead of a str, and it contains a lot of elements (all the - emojis). - - :param document: document to be processed - :param strip_characters: characters used for stripping document - :return: stripped document - """ - if not document: - return document - beg_ind = 0 - end_ind = len(document) - for i in range(len(document)): - if document[i] in strip_characters: - beg_ind += 1 - else: - break - for i in range(1, len(document) + 1): - if document[-i] in strip_characters: - end_ind -= 1 - else: - break - document_stripped = document[beg_ind:end_ind] - return document_stripped
- - -
[docs]def split_on_whitespace(document, new_line=False, tab=False): - """ - This method also removes concatenated spaces. - - :param document: document to be split - :param new_line: whether to split document with '\\\\n' - :param tab: whether to split document with '\\\\t' - :return: word list obtained after splitting document - """ - sep = [' '] + new_line * ['\n'] + tab * ['\t'] - sep = '|'.join(sep) - split_document = re.split(sep, document) - split_document = [word for word in split_document if word] - return split_document
- - -
[docs]def split_on_newline_tab_whitespace(document): - """ - This method is used to split the document into different levels of sub- - sentences. - - First split on "\\\\n", then on "\\\\t", then on " ". - :param document: document to be split - :return: sentence list obtained after splitting document - """ - sentences = document.split('\n') - sentences = [sentence.split('\t') for sentence in sentences] - sentences = [[ - split_on_whitespace(subsentence) for subsentence in sentence - ] for sentence in sentences] - return sentences
- - -
[docs]def merge_on_whitespace_tab_newline(sentences): - """ - This method is used to merge different levels of sub-sentences into one - document. Invert the method split_on_newline_tab_whitespace. Removes - concatenated separators. - - :param sentences: sentence list to be merged - :return: document obtained after merging sub-sentences - """ - sentences = [[ - ' '.join(subsentence) for subsentence in sentence if subsentence - ] for sentence in sentences] - sentences = ['\t'.join(sentence) for sentence in sentences if sentence] - if not sentences: - return '' - document = '\n'.join(sentences) - return document
- - -
[docs]def words_augmentation(words, group_size, join_char): - """ - Augment words, especially for Chinese (without a space between words) and - Vietnamese (with a space between syllables). - - :param words: word list to be augmented - :param group_size: the size of word groups that need to be merged - :param join_char: characters to be added between word groups - :return: word list after augmentation - """ - augmentation = [ - join_char.join(words[i:i + group_size]) - for i in range(len(words) - group_size + 1) - ] - return augmentation
- - -
[docs]def get_words_from_document(document, - token_func=None, - new_line=True, - tab=True,): - """ - Get words from a document. Useful to compute ratios, like the - stopwords ratio. - - :param document: document that need to split words - :param token_func: function of tokenizer, if specified, the function - will be used for split document into different tokens. - :param new_line: whether to use `\\\\n' to split words - :param tab: whether to use '\\\\t' to split words - :return: word list obtained from document - """ - if token_func: - words = token_func(document) - else: - words = split_on_whitespace(document, new_line, tab) - return words
- -
[docs]def words_refinement(words, - lower_case=False, - strip_chars=None, - use_words_aug=False, - words_aug_group_sizes=[2], - words_aug_join_char=''): - """ - Refine split words. Non-reversible since the document is split on - multiple characters, words are stripped of special characters and - characters are converted to lower case. - - :param words: the word list to be refined - :param lower_case: whether to convert word to lowercase - :param strip_chars: chars that need to be stripped in words - :param use_words_aug: whether to use word augmentation - :param words_aug_group_sizes: the size of word groups that need to - be merged - :param words_aug_join_char: characters to be added between word - group - :return: refined words or word list - """ - - if lower_case: - words = [word.lower() for word in words] - if strip_chars: - words = [strip(word, strip_chars) for word in words] - words = [word for word in words if word] - if use_words_aug: - augmentation = [ - words_augmentation(words, group_size, words_aug_join_char) - for group_size in words_aug_group_sizes - ] - augmentation = [word for augm in augmentation for word in augm] - words = words + augmentation - return words
- - -
[docs]def get_sentences_from_document(document, model_func=None): - """ - Get sentences from a document. - - :param document: document that need to split sentences - :param model_func: function of sentence model, if specified, the - function will be used for spliting document into different - sentences. - :return: document with the sentences separated by '\\\\n' - """ - if model_func: - sentences = model_func(document) - else: - sentences = document.splitlines() - return '\n'.join(sentences)
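A short end-to-end sketch of the word helpers above; the import path is assumed from this page and the inputs are toy strings.

from data_juicer.ops.common.helper_func import (get_words_from_document,
                                                words_refinement)

words = get_words_from_document('Hello,   WORLD  example')
# -> ['Hello,', 'WORLD', 'example'] (repeated spaces are collapsed)

refined = words_refinement(words, lower_case=True, strip_chars={',', '.'})
# -> ['hello', 'world', 'example']

aug = words_refinement(['中', '文', '分', '词'], use_words_aug=True)
# -> ['中', '文', '分', '词', '中文', '文分', '分词'] (2-grams appended)
print(refined, aug)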
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html deleted file mode 100644 index b0b0eca96..000000000 --- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html +++ /dev/null @@ -1,213 +0,0 @@ - - - - - - data_juicer.ops.deduplicator.document_deduplicator — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.ops.deduplicator.document_deduplicator

-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01a_catalogue_cleaning_and_filtering/clean_helpers/deduplication.py
-# --------------------------------------------------------
-
-import hashlib
-import string
-from collections import defaultdict
-from typing import Dict, Set
-
-import regex as re
-
-from data_juicer.utils.constant import HashKeys
-
-from ..base_op import OPERATORS, Deduplicator
-
-
-
[docs]@OPERATORS.register_module('document_deduplicator') -class DocumentDeduplicator(Deduplicator): - """ - Deduplicator to deduplicate samples at document-level using exact matching. - - Using md5 hash to deduplicate samples. - """ - - def __init__(self, - lowercase: bool = False, - ignore_non_character: bool = False, - *args, - **kwargs): - """ - Initialization method. - - :param lowercase: Whether to convert sample text to lower case - :param ignore_non_character: Whether to ignore non-alphabet - characters, including whitespaces, digits, and punctuations - :param args: extra args - :param kwargs: extra args. - """ - super().__init__(*args, **kwargs) - self.lowercase = lowercase - self.remove_non_character_regex = re.compile( - f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605 - ) if ignore_non_character else None - -
[docs] def compute_hash(self, sample): - """ - Compute md5 hash values for the sample. - - :param sample: input sample - :return: sample with md5 hash value. - """ - # check if it's computed already - if HashKeys.hash in sample: - return sample - - text = sample[self.text_key] - if self.lowercase: - text = text.lower() - if self.remove_non_character_regex: - text = self.remove_non_character_regex.sub('', text) - - def _get_hash(txt): - return hashlib.md5(txt.strip().encode('utf-8')).hexdigest() - - sample[HashKeys.hash] = _get_hash(text) - return sample
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - dup_hashes = None - if show_num > 0: - # sample duplicate pairs - hash2ids: Dict[int, Set[int]] = defaultdict(set) - for sid, hash_val in enumerate(dataset[HashKeys.hash]): - hash2ids[hash_val].add(sid) - dup_samples = sorted(list(hash2ids.items()), - key=lambda x: len(x[1]), - reverse=True) - dup_hashes = set([ - item[0] for item in dup_samples if len(item[1]) > 1 - ][:show_num]) - - def _filter_dup_helper(sample, hashes): - hash = sample[HashKeys.hash] - if show_num > 0 and hash in dup_hashes \ - and len(dup_pairs[hash]) < 2: - # tracer is open and not enough duplicate sample pairs - dup_pairs[hash].append(sample) - if hash in hashes: - return False - else: - hashes.add(hash) - return True - - hashes = set() - dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {} - dataset = dataset.filter( - _filter_dup_helper, - fn_kwargs=dict(hashes=hashes), - load_from_cache_file=False if show_num > 0 else True) # num_proc=1 - return dataset, dup_pairs
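A hedged sketch of the exact-match dedup flow with the op above: compute_hash is mapped over the dataset first, then process drops repeated hashes. The import path and toy data are assumptions, and in practice the processing pipeline runs these two steps for you.

from datasets import Dataset

from data_juicer.ops.deduplicator.document_deduplicator import \
    DocumentDeduplicator  # assumed import path

ds = Dataset.from_dict({'text': ['same sample', 'same sample', 'another one']})
op = DocumentDeduplicator(lowercase=True, ignore_non_character=False)

ds = ds.map(op.compute_hash)                # adds the md5 hash column
ds, dup_pairs = op.process(ds, show_num=1)  # keeps one copy per hash
print(len(ds))                              # expect 2 samples left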
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html deleted file mode 100644 index 5d6ad6f8d..000000000 --- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html +++ /dev/null @@ -1,413 +0,0 @@ - - - - - - data_juicer.ops.deduplicator.document_minhash_deduplicator — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - - -
  • -
  • -
-
-
-
-
- -

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator

-# Some code here has been modified from:
-# https://github.com/bigcode-project/bigcode-dataset/blob/main/near_deduplication/minhash_deduplication.py
-# --------------------------------------------------------
-
-import hashlib
-import struct
-from collections import defaultdict
-
-import numpy as np
-import regex
-from jsonargparse.typing import ClosedUnitInterval, PositiveInt
-from loguru import logger
-from scipy.integrate import quad as integrate
-from tqdm import tqdm
-
-from data_juicer.utils.constant import HashKeys
-
-from ..base_op import OPERATORS, Deduplicator
-from ..common.helper_func import UnionFind, split_on_whitespace
-
-MERSENNE_PRIME = np.uint64((1 << 61) - 1)
-MAX_HASH = np.uint64((1 << 32) - 1)
-
-
-
[docs]def sha1_hash32(data): - """ - Directly taken from datasketch package to avoid dependency. - - Parameters - ---------- - data : bytes - - Returns - ------- - int - """ - return struct.unpack('<I', hashlib.sha1(data).digest()[:4])[0]
- - -
[docs]def optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float = 0.5, - false_negative_weight: float = 0.5, -): - """ - Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum - of probabilities of false positive and false negative, taken from - datasketch. - - :param threshold: float. The threshold for similarity - :param num_perm: int. The number of permutations - :param false_positive_weight: float. The weight of false positive - :param false_negative_weight: float. The weight of false negative - :return: Tuple[int, int]. The optimal `b` and `r` parameters. The number of - bands, and the number of rows per band respectively - """ - - def false_positive_probability(th: float, band: int, rows: int): - """Source: `datasketch.lsh`""" - - def proba(s): - return 1 - (1 - s**float(rows))**float(band) - - a, _ = integrate(proba, 0.0, th) - return a - - def false_negative_probability(th: float, band: int, rows: int): - """Source: `datasketch.lsh`""" - - def proba(s): - return 1 - (1 - (1 - s**float(rows))**float(band)) - - a, _ = integrate(proba, th, 1.0) - return a - - # object: minimize the weighted FP and FN ratio - min_error = float('inf') - opt = (0, 0) - for b in range(1, num_perm + 1): - max_r = int(num_perm / b) - for r in range(1, max_r + 1): - fp = false_positive_probability(threshold, b, r) - fn = false_negative_probability(threshold, b, r) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (b, r) - return opt
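A quick check of optimal_param above, assuming it is imported from this module: it returns the LSH band/row split that minimizes the weighted false-positive/false-negative probabilities for the given threshold and permutation count.

from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
    optimal_param  # assumed import path

b, r = optimal_param(threshold=0.7, num_perm=256)
# b bands of r rows each, with b * r <= num_perm by construction
print(b, r, b * r)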
- - -
[docs]@OPERATORS.register_module('document_minhash_deduplicator') -class DocumentMinhashDeduplicator(Deduplicator): - """ - Deduplicator to deduplicate samples at document-level using MinHashLSH. - - Different from simhash, minhash is stored as bytes, so they won't be - kept in the final dataset. - """ - - def __init__( - self, - tokenization: str = 'space', - window_size: PositiveInt = 5, - lowercase: bool = True, - ignore_pattern: str = None, - num_permutations: PositiveInt = 256, - jaccard_threshold: ClosedUnitInterval = 0.7, - num_bands: PositiveInt = None, - num_rows_per_band: PositiveInt = None, - *args, - **kwargs, - ): - """ - Initialization method. - - :param tokenization: tokenization method for sample texts. It - should be one of [space, punctuation, character]. For - English-like languages, we recommend to use 'space'. And for - Chinese-like languages, we recommend to use 'character' - :param window_size: window size of shingling - :param lowercase: whether to convert text to lower case first - :param ignore_pattern: whether to ignore sub-strings with - specific pattern when computing minhash - :param num_permutations: number of permutations in minhash - computing - :param jaccard_threshold: the min jaccard similarity threshold - in near-duplicate detection. When the jaccard similarity of - two sample texts is >= this threshold, they are regarded as - similar samples and this op will only keep one of them after - deduplication - :param num_bands: number of bands in LSH. Default it's None, and - it will be determined by an optimal params computation - algorithm by minimize the weighted sum of probs of False - Positives and False Negatives - :param num_rows_per_band: number of rows in each band in LSH. - Default it's None, and it will be determined by an optimal - params computation algorithm - """ - super().__init__(*args, **kwargs) - # about minhash computation - self.tokenization = tokenization - self.window_size = window_size - self.lowercase = lowercase - self.ignore_pattern = ignore_pattern - if self.ignore_pattern: - self.ignore_pattern = regex.compile(self.ignore_pattern) - - # check parameters - if self.ignore_pattern and self.tokenization == 'punctuation': - logger.warning('Be careful that tokenization with punctuations ' - 'won\'t work if the ignore pattern includes ' - 'punctuations.') - - # about deduplication - self.num_permutation = num_permutations - self.jaccard_threshold = jaccard_threshold - self.num_bands = num_bands - self.num_rows_per_band = num_rows_per_band - - # initialize deduplication parameters - # check number of bands and rows - if self.num_bands is None or self.num_rows_per_band is None: - self.num_bands, self.num_rows_per_band = optimal_param( - self.jaccard_threshold, - self.num_permutation, - ) - - # compute hash ranges and create hash tables - self.hash_ranges = [(i * self.num_rows_per_band, - (i + 1) * self.num_rows_per_band) - for i in range(self.num_bands)] - self.hash_tables = [defaultdict(set) for _ in range(self.num_bands)] - - # generate permutations - gen = np.random.RandomState(seed=42) - self.perm_a, self.perm_b = np.array( - [( - gen.randint(1, MERSENNE_PRIME, dtype=np.uint64), - gen.randint(0, MERSENNE_PRIME, dtype=np.uint64), - ) for _ in range(self.num_permutation)], - dtype=np.uint64, - ).T - -
[docs] def compute_hash(self, sample): - """ - Compute minhash values for the sample. - - :param sample: input sample - :return: sample with minhash value. - """ - # check if it's computed already - if HashKeys.minhash in sample: - return sample - - text = sample[self.text_key] - - if self.lowercase: - text = text.lower() - if self.ignore_pattern: - text = self.ignore_pattern.sub('', text) - - # get tokens for different tokenization method - tokens = set() - if self.tokenization == 'character': - tokens = { - str.encode(text[i:i + self.window_size]) - for i in range(len(text) - self.window_size) - } - elif self.tokenization == 'punctuation': - tokens = self.punctuation_pattern.split(text) - tokens = { - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - } - elif self.tokenization == 'space': - tokens = split_on_whitespace(text) - tokens = { - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - } - else: - raise NotImplementedError( - f'Unimplemented tokenization method [{self.tokenization}]') - - # compute minhash value - hv = np.array([sha1_hash32(token) for token in tokens], - dtype=np.uint64) - phv = np.bitwise_and( - ((hv * np.tile(self.perm_a, - (len(hv), 1)).T).T + self.perm_b) % MERSENNE_PRIME, - MAX_HASH) - hash_values = np.vstack([ - phv, - np.ones(self.num_permutation, dtype=np.uint64) * MAX_HASH - ]).min(axis=0) - sample[HashKeys.minhash] = [ - bytes(hash_values[start:end].byteswap().data) - for start, end in self.hash_ranges - ] - return sample
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - minhashes = dataset[HashKeys.minhash] - # remove bytes minhash column otherwise unexpected error would occur - # when exporting the processed dataset - dataset = dataset.remove_columns([HashKeys.minhash]) - - # make clusters -- construct the minhash lookup tables of seg to ids - logger.info(f'Start clustering for {len(dataset)} samples...') - batch_size = 10000 - for i in tqdm(range(0, len(minhashes), batch_size), - dynamic_ncols=True, - desc='Iterating MinHashes of samples...'): - batch = minhashes[i:i + batch_size] - for idx, hs in enumerate(batch): - for h, hashtable in zip(hs, self.hash_tables): - hashtable[h].add(idx + i) - - # using UnionFind set to union samples within the same clusters - union_find = UnionFind() - for table in tqdm(self.hash_tables, - dynamic_ncols=True, - desc='Clustering'): - for cluster in table.values(): - if len(cluster) <= 1: - continue - idx = min(cluster) - for x in cluster: - union_find.union(x, idx) - logger.info(f'There are {len(set(union_find.parent.values()))} ' - f'clusters that includes multiple near-duplicate samples.') - - # record the duplicate sample pairs - dup_pairs = {} - if show_num > 0: - for i in range(len(dataset)): - cluster_idx = union_find.find(i) - if cluster_idx not in dup_pairs and cluster_idx != i: - dup_pairs[cluster_idx] = [ - dataset[cluster_idx], - dataset[i], - ] - if len(dup_pairs) >= show_num: - break - - # filtering -- only keep those samples whose parent index is itself, - # including: - # 1. samples that form a cluster by themselves - # 2. the first sample in a cluster that includes multiple samples - def _filter_minhash_dup_helper(sample, index): - return union_find.find(index) == index - - dataset = dataset.filter( - _filter_minhash_dup_helper, - with_indices=True, - ) - logger.info(f'Keep {len(dataset)} samples after MinHash dedup.') - - return dataset, dup_pairs
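A usage sketch mirroring the exact-dedup example earlier, but with MinHashLSH; the import path and toy sentences are assumptions. Under the settings below the two near-identical sentences end up with the same shingle set, so one of them is dropped.

from datasets import Dataset

from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
    DocumentMinhashDeduplicator  # assumed import path

ds = Dataset.from_dict({'text': [
    'the quick brown fox jumps over the lazy dog today',
    'the quick brown fox jumps over the lazy dog tonight',
    'a completely different sentence about data processing pipelines',
]})
op = DocumentMinhashDeduplicator(tokenization='space',
                                 window_size=5,
                                 jaccard_threshold=0.7)

ds = ds.map(op.compute_hash)                # adds temporary minhash bytes
ds, dup_pairs = op.process(ds, show_num=1)  # drops the near-duplicate
print(len(ds))                              # expect 2 samples left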
diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
deleted file mode 100644
index 0918f324a..000000000
--- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
+++ /dev/null

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator

-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from collections import Counter, defaultdict, deque
-from typing import Dict, Set
-
-import numpy as np
-import regex
-import simhash
-from jsonargparse.typing import PositiveInt
-from loguru import logger
-
-from data_juicer.utils.constant import HashKeys
-
-from ..base_op import OPERATORS, Deduplicator
-from ..common.helper_func import split_on_whitespace
-
-
-
[docs]def local_num_differing_bits(hash_a, hash_b): - """ - Local implementation of calculating the number of different bits between - two integers. - - :param hash_a: integer hash value a - :param hash_b: integer hash value b - :return: number of different bits between input hashes. - """ - cnt = 0 - n = hash_a ^ hash_b - while n != 0: - cnt += 1 - n = n & (n - 1) - return cnt
- - -
[docs]def num_differing_bits_selector(): - """ - Select a num_differing_bits method according to the Python version - installed. - - When Python >= 3.9, the original simhash library cannot be compiled - correctly due to some changes in cython. After fixing this - incompatibility, RecursionError occurs sometimes when calling - simhash.num_differing_bits. So we use our implementation when Python - >= 3.9. Otherwise, we use implementation of simhash. - - :return: an available num_differing_bits function. - """ - import platform - a, b, _ = platform.python_version().split('.') - if a == '3' and int(b) >= 9: - # for >= 3.9, use local implementation - return local_num_differing_bits - else: - # for < 3.9, use simhash version - return simhash.num_differing_bits
-num_differing_bits = num_differing_bits_selector()
[docs]@OPERATORS.register_module('document_simhash_deduplicator') -class DocumentSimhashDeduplicator(Deduplicator): - """Deduplicator to deduplicate samples at document-level using SimHash.""" - - def __init__(self, - tokenization: str = 'space', - window_size: PositiveInt = 6, - lowercase: bool = True, - ignore_pattern: str = None, - num_blocks: PositiveInt = 6, - hamming_distance: PositiveInt = 4, - *args, - **kwargs): - """ - Initialization method :param tokenization: tokenization method for - sample texts. - - It should be one of [space, punctuation, character]. For - English-like languages, we recommend to use 'space'. And for - Chinese-like languages, we recommend to use 'character' - - :param window_size: window size of shingling - :param lowercase: whether to convert text to lower case first - :param ignore_pattern: whether to ignore sub-strings with - specific pattern when computing simhash - :param num_blocks: number of blocks in simhash computing - :param hamming_distance: the max hamming distance threshold in - near-duplicate detection. When the hamming distance of two - sample texts is <= this threshold, they are regarded as - similar samples and this op will only keep one of them after - deduplication. This threshold should be always less than - num_blocks - """ - # about simhash computation - super().__init__(*args, **kwargs) - self.tokenization = tokenization - self.window_size = window_size - self.lowercase = lowercase - self.ignore_pattern = ignore_pattern - if self.ignore_pattern: - self.ignore_pattern = regex.compile(self.ignore_pattern) - - # check parameters - if self.ignore_pattern and self.tokenization == 'punctuation': - logger.warning('Be careful that tokenization with punctuations ' - 'won\'t work if the ignore pattern includes ' - 'punctuations.') - - # about deduplication - self.num_blocks = num_blocks - self.hamming_distance = hamming_distance - -
[docs] def compute_hash(self, sample): - """ - Compute simhash values for the sample. - - :param sample: input sample - :return: sample with simhash value. - """ - # check if it's computed already - if HashKeys.simhash in sample: - return sample - - text = sample[self.text_key] - - if self.lowercase: - text = text.lower() - if self.ignore_pattern: - text = self.ignore_pattern.sub('', text) - - # get tokens for different tokenization method - tokens = [] - if self.tokenization == 'character': - tokens = [ - str.encode(text[i:i + self.window_size]) - for i in range(len(text) - self.window_size) - ] - elif self.tokenization == 'punctuation': - tokens = self.punctuation_pattern.split(text) - tokens = [ - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - ] - elif self.tokenization == 'space': - tokens = split_on_whitespace(text) - tokens = [ - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - ] - else: - raise NotImplementedError( - f'Unimplemented tokenization method [{self.tokenization}]') - - # compute simhash - sample[HashKeys.simhash] = np.uint64( - simhash.compute(map(simhash.unsigned_hash, tokens))) - return sample
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - # find matches - logger.info(f'Start querying {len(dataset)} samples.') - matches = simhash.find_all( - dataset[HashKeys.simhash], - self.num_blocks, - self.hamming_distance, - ) - logger.info(f'Querying done, found {len(matches)} matches.') - - # compute hash diff distribution - graph = defaultdict(dict) - dist = Counter() - for x, y in matches: - graph[x][y] = graph[y][x] = True - num_diff = num_differing_bits(x, y) - dist[num_diff] += 1 - logger.info(f'Hash diff distribution: {dist}') - - hash2ids: Dict[int, Set[str]] = defaultdict(set) - hashes: Set[int] = set(dataset[HashKeys.simhash]) - hash2cluster: Dict[int, int] = {} - visited: Set[int] = set() - cluster_id: int = 0 - - for sid, hash_val in enumerate(dataset[HashKeys.simhash]): - hash2ids[hash_val].add(str(sid)) - - # clustering - dup_pairs = {} # store duplicate pairs when show_num > 0 - while hashes: - hash_val = hashes.pop() - if hash_val in visited: - continue - - # if this hash value is not in the matches list, it's regarded as a - # single cluster - if hash_val not in graph: - continue - - # Otherwise, BFS to find the cluster - q = deque([hash_val]) - visited.add(hash_val) - hash2cluster[hash_val] = cluster_id - if show_num > 0 and len(dup_pairs) < show_num: - dup_pairs[cluster_id] = [] - - while q: - curr = q.popleft() - for neighbor in graph[curr]: - if neighbor in visited: - continue - visited.add(neighbor) - q.append(neighbor) - hash2cluster[neighbor] = cluster_id - - cluster_id += 1 - logger.info(f'Found {cluster_id} clusters and {len(graph)} hashes.') - - # filter duplicated samples - # NOTICE: For now, we only keep the first sample in a cluster. Maybe - # there are some better strategies later. - def _filter_simhash_dup_helper(sample, visited_clusters, - visited_hashes): - sample_hash_val = sample[HashKeys.simhash] - if sample_hash_val not in hash2cluster: - # single-sample cluster, we need to check hash value still. - if sample_hash_val in visited_hashes: - return False - else: - visited_hashes.add(sample_hash_val) - return True - else: - cluster_num = hash2cluster[sample_hash_val] - if show_num > 0 and cluster_num in dup_pairs \ - and len(dup_pairs[cluster_num]) < 2: - dup_pairs[cluster_num].append(sample) - # regular cluster, check cluster number. - if cluster_num in visited_clusters: - return False - else: - visited_clusters.add(cluster_num) - return True - - cluster_record = set() - hash_record = set() - dataset = dataset.filter( - _filter_simhash_dup_helper, - fn_kwargs=dict(visited_clusters=cluster_record, - visited_hashes=hash_record), - load_from_cache_file=False if show_num > 0 else True) - logger.info(f'Keep {len(dataset)} samples after SimHash dedup.') - - return dataset, dup_pairs
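The clustering above treats two documents as near-duplicates when their simhash values differ in at most `hamming_distance` bits. A tiny sketch of that test, with made-up 16-bit hash values:

def num_differing_bits(hash_a, hash_b):
    # same popcount-of-XOR logic as local_num_differing_bits above
    return bin(hash_a ^ hash_b).count('1')

h1 = 0b1011_0110_1100_0001
h2 = 0b1011_0010_1100_1001   # differs in two bit positions
hamming_distance = 4          # threshold used by the op above
print(num_differing_bits(h1, h2))                      # 2
print(num_differing_bits(h1, h2) <= hamming_distance)  # True -> near-duplicates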
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
deleted file mode 100644
index a7093eeca..000000000
--- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.alphanumeric_filter

-import sys
-
-from jsonargparse.typing import PositiveFloat
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import get_words_from_document
-
-
-
[docs]@OPERATORS.register_module('alphanumeric_filter') -class AlphanumericFilter(Filter): - """Filter to keep samples with alphabet/numeric ratio within a specific - range.""" - - def __init__(self, - tokenization: bool = False, - min_ratio: float = 0.25, - max_ratio: PositiveFloat = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param tokenization: Whether to count the ratio of alphanumeric - to the total number of tokens. if tokenization=False, it - will count the ratio of alphanumeric to the total number of - characters. - :param min_ratio: The min filter ratio in alphanumeric op, - samples will be filtered if their alphabet/numeric ratio is - below this parameter. - :param max_ratio: The max filter ratio in alphanumeric op, - samples will be filtered if their alphabet/numeric ratio - exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.tokenization = tokenization - self.min_ratio = min_ratio - self.max_ratio = max_ratio - self.model_key = None - - if tokenization: - self.model_key = prepare_model( - model_type='huggingface', - model_key='EleutherAI/pythia-6.9b-deduped') - -
[docs] def compute_stats(self, sample): - if self.tokenization: - if StatsKeys.alpha_token_ratio in sample[Fields.stats]: - return sample - alpha_count = sum( - map(lambda char: 1 - if char.isalpha() else 0, sample[self.text_key])) - tokenizer = get_model(self.model_key, model_type='huggingface') - token_count = len( - get_words_from_document( - sample[self.text_key], - token_func=tokenizer.tokenize if tokenizer else None)) - sample[Fields.stats][StatsKeys.alpha_token_ratio] = ( - alpha_count / token_count) if token_count != 0 else 0.0 - else: - if StatsKeys.alnum_ratio in sample[Fields.stats]: - return sample - alnum_count = sum( - map(lambda char: 1 - if char.isalnum() else 0, sample[self.text_key])) - sample[Fields.stats][StatsKeys.alnum_ratio] = ( - alnum_count / len(sample[self.text_key])) if len( - sample[self.text_key]) != 0 else 0.0 - return sample
- -
[docs] def process(self, sample): - ratio = sample[Fields.stats][ - StatsKeys.alpha_token_ratio] if self.tokenization else sample[ - Fields.stats][StatsKeys.alnum_ratio] - if self.min_ratio <= ratio <= self.max_ratio: - return True - else: - return False
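With tokenization=False, the keep condition reduces to: alphanumeric characters / total characters within [min_ratio, max_ratio]. A standalone sketch of that check (the example text and thresholds are illustrative):

def alnum_ratio(text):
    return sum(c.isalnum() for c in text) / len(text) if text else 0.0

text = 'Price: $12.99 (today only!)'
ratio = alnum_ratio(text)
print(round(ratio, 3), 0.25 <= ratio <= 1.0)  # keep if within [min_ratio, max_ratio]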
diff --git a/_modules/data_juicer/ops/filter/average_line_length_filter.html b/_modules/data_juicer/ops/filter/average_line_length_filter.html
deleted file mode 100644
index fb32b60f3..000000000
--- a/_modules/data_juicer/ops/filter/average_line_length_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.average_line_length_filter

-import sys
-
-from jsonargparse.typing import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_LINES
-
-
-
[docs]@OPERATORS.register_module('average_line_length_filter') -@INTER_LINES.register_module('average_line_length_filter') -class AverageLineLengthFilter(Filter): - """Filter to keep samples with average line length within a specific - range.""" - - def __init__(self, - min_len: PositiveInt = 10, - max_len: PositiveInt = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min filter length in this op, samples will - be filtered if their average line length is below this - parameter. - :param max_len: The max filter length in this op, samples will - be filtered if their average line length exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.avg_line_length in sample[Fields.stats]: - return sample - - context_key = f'{InterVars.lines}' - if context and context_key in sample[Fields.context]: - lines = sample[Fields.context][context_key] - else: - lines = sample[self.text_key].splitlines() - if context: - sample[Fields.context][context_key] = lines - sample[Fields.stats][StatsKeys.avg_line_length] = \ - len(sample[self.text_key]) / len(lines) \ - if len(lines) != 0 else 0.0 - return sample
- -
[docs] def process(self, sample): - if self.min_len <= sample[Fields.stats][ - StatsKeys.avg_line_length] <= self.max_len: - return True - else: - return False
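The stat is simply the total text length divided by the number of lines. A quick sketch with made-up text and thresholds:

text = 'short line\na noticeably longer second line\nmid-sized line'
lines = text.splitlines()
avg_len = len(text) / len(lines) if lines else 0.0
print(avg_len, 10 <= avg_len <= 200)  # keep if within [min_len, max_len]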
diff --git a/_modules/data_juicer/ops/filter/character_repetition_filter.html b/_modules/data_juicer/ops/filter/character_repetition_filter.html
deleted file mode 100644
index 675915560..000000000
--- a/_modules/data_juicer/ops/filter/character_repetition_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.character_repetition_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-import numpy as np
-from jsonargparse.typing import ClosedUnitInterval, PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('character_repetition_filter') -class CharacterRepetitionFilter(Filter): - """Filter to keep samples with char-level n-gram repetition ratio within a - \ specific range.""" - - def __init__(self, - rep_len: PositiveInt = 10, - min_ratio: ClosedUnitInterval = 0.0, - max_ratio: ClosedUnitInterval = 0.5, - *args, - **kwargs): - """ - Initialization method. - - :param rep_len: Repetition length for char-level n-gram. - :param min_ratio: The min filter ratio in this op, samples will - be filtered if their char-level n-gram repetition ratio is - below this parameter. - :param max_ratio: The max filter ratio in this op, samples will - be filtered if their char-level n-gram repetition ratio - exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.n = rep_len - self.min_ratio = min_ratio - self.max_ratio = max_ratio - -
[docs] def compute_stats(self, sample): - # check if it's computed already - if StatsKeys.char_rep_ratio in sample[Fields.stats]: - return sample - - char_ngrams = [ - sample[self.text_key][i:i + self.n] - for i in range(len(sample[self.text_key]) - self.n + 1) - ] - freq_char_ngrams = {} - for char_ngram in char_ngrams: - freq_char_ngrams[char_ngram] = ( - freq_char_ngrams.get(char_ngram, 0) + 1) - - if len(freq_char_ngrams) == 0: - sample[Fields.stats][StatsKeys.char_rep_ratio] = 0.0 - return sample - - freq_char_ngrams = sorted(list(freq_char_ngrams.values()), - reverse=True) - rep_more_than_one = len([el for el in freq_char_ngrams if el > 1]) - num_rep_char_ngrams = min( - int(np.sqrt(len(freq_char_ngrams))), - len(freq_char_ngrams) - rep_more_than_one, - ) - sample[Fields.stats][StatsKeys.char_rep_ratio] = (sum( - freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams)) \ - if sum(freq_char_ngrams) != 0 else 0.0 - return sample
- -
[docs] def process(self, sample): - if self.min_ratio <= sample[Fields.stats][StatsKeys.char_rep_ratio] \ - <= self.max_ratio: - return True - else: - return False
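The repetition stat is the share of all char-level n-gram occurrences captured by the top-k most frequent n-grams, where k is roughly the square root of the number of distinct n-grams, capped by the count of n-grams that occur only once. A standalone sketch of that formula, with illustrative inputs:

import math
from collections import Counter

def char_rep_ratio(text, n=10):
    # mirrors compute_stats above: share of occurrences taken by the most
    # frequent char-level n-grams (top-k, k ~ sqrt of distinct n-gram count)
    freqs = sorted(Counter(text[i:i + n] for i in range(len(text) - n + 1)).values(),
                   reverse=True)
    if not freqs:
        return 0.0
    num_unique = sum(1 for f in freqs if f == 1)
    top_k = min(int(math.sqrt(len(freqs))), num_unique)
    return sum(freqs[:top_k]) / sum(freqs)

spammy = 'click here to win! ' * 5 + 'plus a short tail of ordinary, varied prose.'
plain = 'a reasonably varied sentence with hardly any repeated character n-grams.'
print(round(char_rep_ratio(spammy), 3), round(char_rep_ratio(plain), 3))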
diff --git a/_modules/data_juicer/ops/filter/flagged_words_filter.html b/_modules/data_juicer/ops/filter/flagged_words_filter.html
deleted file mode 100644
index ae479c942..000000000
--- a/_modules/data_juicer/ops/filter/flagged_words_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.flagged_words_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from jsonargparse.typing import ClosedUnitInterval, List
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ...utils.asset_utils import ASSET_DIR, load_words_asset
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_WORDS
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-
-
-
[docs]@OPERATORS.register_module('flagged_words_filter') -@INTER_WORDS.register_module('flagged_words_filter') -class FlaggedWordFilter(Filter): - """Filter to keep samples with flagged-word ratio less than a specific max - value.""" - - def __init__(self, - lang: str = 'en', - tokenization: bool = False, - max_ratio: ClosedUnitInterval = 0.045, - flagged_words_dir: str = ASSET_DIR, - use_words_aug: bool = False, - words_aug_group_sizes: List = [2], - words_aug_join_char: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param lang: Consider flagged words in what language. If lang == - "all", we will adopt the one merged from all the available - languages - :param tokenization: Whether to use model to tokenize documents - :param max_ratio: The max filter ratio in this op. - :param flagged_words_dir: The directory storing the - flagged_words file(s) whose name includes "flagged_words" - and in json format - :param use_words_aug: Whether to augment words, especially for - Chinese and Vietnamese - :param words_aug_group_sizes: The group size of words to augment - :param words_aug_join_char: The join char between words to - augment - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.lang = lang - self.max_ratio = max_ratio - self.use_words_aug = use_words_aug - self.words_aug_group_sizes = words_aug_group_sizes - self.words_aug_join_char = words_aug_join_char - self.model_key = None - self.lang = lang - - self.FLAGGED_WORDS = load_words_asset(words_dir=flagged_words_dir, - words_type='flagged_words') - - if 'all' not in self.FLAGGED_WORDS: - self.FLAGGED_WORDS['all'] = [ - val for vals in self.FLAGGED_WORDS.values() for val in vals - ] - if tokenization: - self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.flagged_words_ratio in sample[Fields.stats]: - return sample - - # try to get words from context - words_key = f'{InterVars.words}-{self.model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.model_key, lang=self.lang, - model_type='sentencepiece') - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - - # try to get refined words from context - refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \ - f'{self.use_words_aug}-' \ - f'{self.words_aug_group_sizes}-' \ - f'{self.words_aug_join_char}' - if context and refined_words_key in sample[Fields.context]: - words = sample[Fields.context][refined_words_key] - else: - words = words_refinement( - words, - lower_case=True, - strip_chars=SPECIAL_CHARACTERS, - use_words_aug=self.use_words_aug, - words_aug_group_sizes=self.words_aug_group_sizes, - words_aug_join_char=self.words_aug_join_char) - if context: - sample[Fields.context][refined_words_key] = words - - flagged_words_ratio = (len( - [word - for word in words if word in self.FLAGGED_WORDS[self.lang]]) / - len(words)) if len(words) != 0 else 0.0 - - if flagged_words_ratio > 1.0: - flagged_words_ratio = 1.0 - - sample[Fields.stats][ - StatsKeys.flagged_words_ratio] = flagged_words_ratio - return sample
- -
[docs] def process(self, sample): - return sample[Fields.stats][ - StatsKeys.flagged_words_ratio] <= self.max_ratio
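The stat is the fraction of (refined) words that appear in the per-language flagged-word list. A toy sketch with an invented word list and the default threshold:

# toy flagged-word list; the real op loads per-language lists from flagged_words_dir
FLAGGED = {'badword', 'slur'}

def flagged_ratio(words):
    return sum(w in FLAGGED for w in words) / len(words) if words else 0.0

words = 'this text contains one badword among eight words'.split()
print(flagged_ratio(words), flagged_ratio(words) <= 0.045)  # ratio, keep-decision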
diff --git a/_modules/data_juicer/ops/filter/language_id_score_filter.html b/_modules/data_juicer/ops/filter/language_id_score_filter.html
deleted file mode 100644
index 2ffcbafc5..000000000
--- a/_modules/data_juicer/ops/filter/language_id_score_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.language_id_score_filter

-from jsonargparse.typing import ClosedUnitInterval
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('language_id_score_filter') -class LanguageIDScoreFilter(Filter): - """Filter to keep samples in a specific language with confidence score - larger than a specific min value.""" - - def __init__(self, - lang: str = '', - min_score: ClosedUnitInterval = 0.8, - *args, - **kwargs): - """ - Initialization method. - - :param lang: Samples in which language to keep. - :param min_score: The min language identification confidence - scores of samples to keep. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.lang = lang - self.min_score = min_score - self.model_key = prepare_model(lang=lang, model_type='fasttext') - -
[docs] def compute_stats(self, sample): - # check if it's computed already - if StatsKeys.lang in sample[ - Fields.stats] and StatsKeys.lang_score in sample[Fields.stats]: - return sample - - text = sample[self.text_key].lower().replace('\n', ' ') - ft_model = get_model(self.model_key, lang=self.lang, model_type='fasttext') - if ft_model is None: - err_msg = 'Model not loaded. Please retry later.' - logger.error(err_msg) - raise ValueError(err_msg) - pred = ft_model.predict(text) - lang_id = pred[0][0].replace('__label__', '') - lang_score = pred[1][0] - - sample[Fields.stats][StatsKeys.lang] = lang_id - sample[Fields.stats][StatsKeys.lang_score] = lang_score - - return sample
- -
[docs] def process(self, sample): - if self.lang: - return sample[Fields.stats][StatsKeys.lang] == self.lang \ - and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score - else: - return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
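A hedged sketch of the underlying fastText call; the model file name is an assumption, and any fastText language-identification model would serve the same purpose:

import fasttext  # assumes the fasttext package and a downloaded lid model

model = fasttext.load_model('lid.176.bin')  # path to a language-id model (assumption)
text = 'Le dépôt contient des opérateurs de filtrage.'.lower().replace('\n', ' ')
labels, scores = model.predict(text)
lang_id = labels[0].replace('__label__', '')
print(lang_id, float(scores[0]))
# keep only if lang_id == target lang and score >= min_score, as in process() above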
diff --git a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
deleted file mode 100644
index dafe68c56..000000000
--- a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.maximum_line_length_filter

-import sys
-
-from jsonargparse.typing import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_LINES
-
-
-
[docs]@OPERATORS.register_module('maximum_line_length_filter') -@INTER_LINES.register_module('maximum_line_length_filter') -class MaximumLineLengthFilter(Filter): - """Filter to keep samples with maximum line length within a specific - range.""" - - def __init__(self, - min_len: PositiveInt = 10, - max_len: PositiveInt = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min filter length in this op, samples will - be filtered if their maximum line length is below this - parameter. - :param max_len: The max filter length in this op, samples will - be filtered if their maximum line length exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.max_line_length in sample[Fields.stats]: - return sample - - context_key = f'{InterVars.lines}' - if context and context_key in sample[Fields.context]: - lines = sample[Fields.context][context_key] - else: - lines = sample[self.text_key].splitlines() - if context: - sample[Fields.context][context_key] = lines - line_lengths = list(map(len, lines)) - sample[Fields.stats][StatsKeys.max_line_length] = max( - line_lengths) if line_lengths else 0.0 - return sample
- -
[docs] def process(self, sample): - if self.min_len <= sample[Fields.stats][ - StatsKeys.max_line_length] <= self.max_len: - return True - else: - return False
diff --git a/_modules/data_juicer/ops/filter/perplexity_filter.html b/_modules/data_juicer/ops/filter/perplexity_filter.html
deleted file mode 100644
index 3d52be318..000000000
--- a/_modules/data_juicer/ops/filter/perplexity_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.perplexity_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from jsonargparse.typing import PositiveFloat
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_WORDS
-from ..common import get_words_from_document
-
-
-
[docs]@OPERATORS.register_module('perplexity_filter') -@INTER_WORDS.register_module('perplexity_filter') -class PerplexityFilter(Filter): - """Filter to keep samples with perplexity score less than a specific max - value.""" - - def __init__(self, - lang: str = 'en', - max_ppl: PositiveFloat = 1500, - *args, - **kwargs): - """ - Initialization method. - - :param lang: Compute perplexity for samples in which language. - :param max_ppl: The max filter perplexity in this op, samples - will be filtered if their perplexity exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.max_ppl = max_ppl - self.lang = lang - self.sp_model_key = prepare_model(lang=lang, - model_type='sentencepiece') - self.kl_model_key = prepare_model(lang=lang, model_type='kenlm') - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.perplexity in sample[Fields.stats]: - return sample - - # tokenization - words_key = f'{InterVars.words}-{self.sp_model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.sp_model_key, self.lang, 'sentencepiece') - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - text = ' '.join(words) - # compute perplexity - logits, length = 0, 0 - kenlm_model = get_model(self.kl_model_key, self.lang, 'kenlm') - for line in text.splitlines(): - logits += kenlm_model.score(line) - length += (len(line.split()) + 1) - ppl = (10.0**(-logits / length)) if length != 0 else 0.0 - sample[Fields.stats][StatsKeys.perplexity] = round(ppl, 1) - - return sample
- -
[docs] def process(self, sample): - return sample[Fields.stats][StatsKeys.perplexity] <= self.max_ppl
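Perplexity is derived from the accumulated per-line KenLM log10 scores as ppl = 10^(-logits / length). A purely arithmetic sketch with made-up scores:

# toy numbers: per-line log10 scores as a kenlm model would return them (assumption)
line_log10_scores = [-45.2, -38.7]      # logits accumulated over lines
line_word_counts = [12 + 1, 10 + 1]     # len(line.split()) + 1 per line

logits = sum(line_log10_scores)
length = sum(line_word_counts)
ppl = 10.0 ** (-logits / length) if length else 0.0
print(round(ppl, 1), ppl <= 1500)       # stat and keep-decision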
diff --git a/_modules/data_juicer/ops/filter/special_characters_filter.html b/_modules/data_juicer/ops/filter/special_characters_filter.html
deleted file mode 100644
index d2a5e6767..000000000
--- a/_modules/data_juicer/ops/filter/special_characters_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.special_characters_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from jsonargparse.typing import ClosedUnitInterval
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-from ..common import SPECIAL_CHARACTERS
-
-
-
[docs]@OPERATORS.register_module('special_characters_filter') -class SpecialCharactersFilter(Filter): - """Filter to keep samples with special-char ratio within a specific - range.""" - - def __init__(self, - min_ratio: ClosedUnitInterval = 0.0, - max_ratio: ClosedUnitInterval = 0.25, - *args, - **kwargs): - """ - Initialization method. - - :param min_ratio: The min filter ratio in this op, samples will - be filtered if their special-char ratio is below this - parameter. - :param max_ratio: The max filter ratio in this op, samples will - be filtered if their special-char ratio exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_ratio = min_ratio - self.max_ratio = max_ratio - -
[docs] def compute_stats(self, sample): - # check if it's computed already - if StatsKeys.special_char_ratio in sample[Fields.stats]: - return sample - - # get ratio of special characters - sample[Fields.stats][StatsKeys.special_char_ratio] = ( - len([c - for c in sample[self.text_key] if c in SPECIAL_CHARACTERS]) / - len(sample[self.text_key])) if len( - sample[self.text_key]) != 0 else 0.0 - return sample
- -
[docs] def process(self, sample): - if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \ - <= self.max_ratio: - return True - else: - return False
diff --git a/_modules/data_juicer/ops/filter/specified_field_filter.html b/_modules/data_juicer/ops/filter/specified_field_filter.html
deleted file mode 100644
index 76d98d2f1..000000000
--- a/_modules/data_juicer/ops/filter/specified_field_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.specified_field_filter

-from typing import List, Tuple, Union
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('specified_field_filter') -class SpecifiedFieldFilter(Filter): - """ - Filter based on specified field information. - - If the specified field information in the sample is not within the - specified target value, the sample will be filtered. - """ - - def __init__(self, - field_key: str = '', - target_value: Union[List, Tuple] = [], - *args, - **kwargs): - """ - Initialization method. - - :param field_key: Filter based on the specified value - corresponding to the target key. The target key - corresponding to multi-level field information need to be - separated by '.'. - :param target_value: The range of specified field information - corresponding to the samples that need to be retained. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.field_key = field_key - self.target_value = target_value - -
[docs] def compute_stats(self, sample): - return sample
- -
[docs] def process(self, sample): - if not (self.field_key and self.target_value): - return True - - field_value = sample - for key in self.field_key.split('.'): - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - - if not (isinstance(field_value, list) - or isinstance(field_value, tuple)): - field_value = [field_value] - for value in field_value: - if value not in self.target_value: - return False - return True
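The '.'-separated field_key is resolved by walking nested dictionaries, and every resolved value must fall inside target_value. A small sketch with an invented sample:

sample = {'meta': {'source': 'wiki', 'quality': 'high'}, 'text': '...'}
field_key, target_value = 'meta.source', ['wiki', 'book']

value = sample
for key in field_key.split('.'):   # walk multi-level keys separated by '.'
    value = value[key]

values = value if isinstance(value, (list, tuple)) else [value]
print(all(v in target_value for v in values))  # True -> sample is kept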
diff --git a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
deleted file mode 100644
index 3a8325c2c..000000000
--- a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.specified_numeric_field_filter

-import sys
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]def is_number(s): - if s: - try: - float(s) - return True - except ValueError: - pass - return False
- - -
[docs]@OPERATORS.register_module('specified_numeric_field_filter') -class SpecifiedNumericFieldFilter(Filter): - """ - Filter based on specified numeric field information. - - If the specified numeric information in the sample is not within the - specified range, the sample will be filtered. - """ - - def __init__(self, - field_key: str = '', - min_value: float = -sys.maxsize, - max_value: float = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param field_key: Filter based on the specified numeric value - corresponding to the target key. The target key - corresponding to multi-level field information need to be - separated by '.'. - :param min_value: The min filter value in SpecifiedNumericField - op, samples will be filtered if their specified numeric - field value is below this parameter. - :param max_value: The max filter value in SpecifiedNumericField - op, samples will be filtered if their specified numeric - field value exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.field_key = field_key - self.min_value = min_value - self.max_value = max_value - -
[docs] def compute_stats(self, sample): - return sample
- -
[docs] def process(self, sample): - if not self.field_key: - return True - - field_value = sample - for key in self.field_key.split('.'): - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - - if is_number(field_value): - field_value = float(field_value) - return self.min_value <= field_value <= self.max_value - else: - return False
diff --git a/_modules/data_juicer/ops/filter/stopwords_filter.html b/_modules/data_juicer/ops/filter/stopwords_filter.html
deleted file mode 100644
index b1e25c03b..000000000
--- a/_modules/data_juicer/ops/filter/stopwords_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.stopwords_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from jsonargparse.typing import ClosedUnitInterval, List
-
-from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_WORDS
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-
-
-
[docs]@OPERATORS.register_module('stopwords_filter') -@INTER_WORDS.register_module('stopwords_filter') -class StopWordsFilter(Filter): - """Filter to keep samples with stopword ratio larger than a specific min - value.""" - - def __init__(self, - lang: str = 'en', - tokenization: bool = False, - min_ratio: ClosedUnitInterval = 0.3, - stopwords_dir: str = ASSET_DIR, - use_words_aug: bool = False, - words_aug_group_sizes: List = [2], - words_aug_join_char: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param lang: Consider stopwords in what language. If lang == - "all", we will adopt the one merged from all the available - languages - :param tokenization: whether to use model to tokenize documents - :param min_ratio: The min filter ratio in this op. - :param stopwords_dir: The directory storing the stopwords - file(s) whose name includes "stopwords" and in json format - :param use_words_aug: Whether to augment words, especially for - Chinese and Vietnamese - :param words_aug_group_sizes: The group size of words to augment - :param words_aug_join_char: The join char between words to - augment - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.lang = lang - self.min_ratio = min_ratio - self.use_words_aug = use_words_aug - self.words_aug_group_sizes = words_aug_group_sizes - self.words_aug_join_char = words_aug_join_char - self.model_key = None - self.lang = lang - - self.STOPWORDS = load_words_asset(words_dir=stopwords_dir, - words_type='stopwords') - if 'all' not in self.STOPWORDS: - self.STOPWORDS['all'] = [ - val for vals in self.STOPWORDS.values() for val in vals - ] - if tokenization: - self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.stopwords_ratio in sample[Fields.stats]: - return sample - - # try to get words from context - words_key = f'{InterVars.words}-{self.model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.model_key, lang=self.lang, - model_type='sentencepiece') - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - - # try to get refined words from context - refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \ - f'{self.use_words_aug}-' \ - f'{self.words_aug_group_sizes}-' \ - f'{self.words_aug_join_char}' - if context and refined_words_key in sample[Fields.context]: - words = sample[Fields.context][refined_words_key] - else: - words = words_refinement( - words, - lower_case=True, - strip_chars=SPECIAL_CHARACTERS, - use_words_aug=self.use_words_aug, - words_aug_group_sizes=self.words_aug_group_sizes, - words_aug_join_char=self.words_aug_join_char) - if context: - sample[Fields.context][refined_words_key] = words - - stopwords_ratio = ( - len([word for word in words - if word in self.STOPWORDS[self.lang]]) - / len(words)) \ - if len(words) != 0 else 0.0 - - if stopwords_ratio > 1.0: - stopwords_ratio = 1.0 - - sample[Fields.stats][StatsKeys.stopwords_ratio] = stopwords_ratio - return sample
- -
[docs] def process(self, sample): - return sample[Fields.stats][ - StatsKeys.stopwords_ratio] >= self.min_ratio
diff --git a/_modules/data_juicer/ops/filter/suffix_filter.html b/_modules/data_juicer/ops/filter/suffix_filter.html
deleted file mode 100644
index 8c9cafa71..000000000
--- a/_modules/data_juicer/ops/filter/suffix_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.suffix_filter

-from typing import List, Tuple, Union
-
-from data_juicer.utils.constant import Fields
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('suffix_filter') -class SuffixFilter(Filter): - """Filter to keep samples with specified suffix.""" - - def __init__(self, - suffixes: Union[str, List[str], Tuple[str]] = [], - *args, - **kwargs): - """ - Initialization method. - - :param suffixes: the suffix of text that will be keep. - For example: '.txt', 'txt' or ['txt', '.pdf', 'docx'] - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - if suffixes is None: - self.suffixes = [] - elif isinstance(suffixes, str): - self.suffixes = [suffixes] - else: - self.suffixes = suffixes - -
[docs] def compute_stats(self, sample): - return sample
- -
[docs] def process(self, sample): - if self.suffixes: - if sample[Fields.suffix] in self.suffixes: - return True - else: - return False - else: - return True
diff --git a/_modules/data_juicer/ops/filter/text_length_filter.html b/_modules/data_juicer/ops/filter/text_length_filter.html
deleted file mode 100644
index 2e1fd6053..000000000
--- a/_modules/data_juicer/ops/filter/text_length_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.text_length_filter

-import sys
-
-from jsonargparse.typing import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('text_length_filter') -class TextLengthFilter(Filter): - """Filter to keep samples with total text length within a specific - range.""" - - def __init__(self, - min_len: PositiveInt = 10, - max_len: PositiveInt = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min text length in the filtering. samples - will be filtered if their text length is below this - parameter. - :param max_len: The max text length in the filtering. samples - will be filtered if their text length exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len - -
[docs] def compute_stats(self, sample): - # check if it's computed already - if StatsKeys.text_len in sample[Fields.stats]: - return sample - - sample[Fields.stats][StatsKeys.text_len] = len(sample[self.text_key]) - return sample
- -
[docs] def process(self, sample): - if self.min_len <= sample[Fields.stats][ - StatsKeys.text_len] <= self.max_len: - return True - else: - return False
diff --git a/_modules/data_juicer/ops/filter/word_num_filter.html b/_modules/data_juicer/ops/filter/word_num_filter.html
deleted file mode 100644
index 3b198b979..000000000
--- a/_modules/data_juicer/ops/filter/word_num_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.word_num_filter

-import sys
-
-from jsonargparse.typing import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_WORDS
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-
-
-
[docs]@OPERATORS.register_module('words_num_filter') -@INTER_WORDS.register_module('words_num_filter') -class WordNumFilter(Filter): - """Filter to keep samples with total words number within a specific - range.""" - - def __init__(self, - lang: str = 'en', - tokenization: bool = False, - min_num: PositiveInt = 10, - max_num: PositiveInt = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language. - :param tokenization: whether to use model to tokenize documents - :param min_num: The min filter word number in this op, samples - will be filtered if their word number is below this - parameter. - :param max_num: The max filter word number in this op, samples - will be filtered if their word number exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_num = min_num - self.max_num = max_num - self.model_key = None - self.lang = lang - - if tokenization: - self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.num_words in sample[Fields.stats]: - return sample - - words_key = f'{InterVars.words}-{self.model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.model_key, lang=self.lang, - model_type='sentencepiece') - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - words = words_refinement(words, strip_chars=SPECIAL_CHARACTERS) - sample[Fields.stats][StatsKeys.num_words] = len(words) - return sample
- -
[docs] def process(self, sample): - if self.min_num <= sample[Fields.stats][ - StatsKeys.num_words] <= self.max_num: - return True - else: - return False
diff --git a/_modules/data_juicer/ops/filter/word_repetition_filter.html b/_modules/data_juicer/ops/filter/word_repetition_filter.html
deleted file mode 100644
index 57eeff82f..000000000
--- a/_modules/data_juicer/ops/filter/word_repetition_filter.html
+++ /dev/null

Source code for data_juicer.ops.filter.word_repetition_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from jsonargparse.typing import ClosedUnitInterval, PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys, InterVars
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_WORDS
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-
-
-
[docs]@OPERATORS.register_module('word_repetition_filter') -@INTER_WORDS.register_module('word_repetition_filter') -class WordRepetitionFilter(Filter): - """Filter to keep samples with word-level n-gram repetition ratio within a - \ specific range.""" - - def __init__(self, - lang: str = 'en', - tokenization: bool = False, - rep_len: PositiveInt = 10, - min_ratio: ClosedUnitInterval = 0.0, - max_ratio: ClosedUnitInterval = 0.5, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language. - :param tokenization: whether to use model to tokenize documents - :param rep_len: Repetition length for word-level n-gram. - :param min_ratio: The min filter ratio in this op, samples will - be filtered if their word-level n-gram repetition ratio is - below this parameter. - :param max_ratio: The max filter ratio in this op, samples will - be filtered if their word-level n-gram repetition ratio - exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.n = rep_len - self.min_ratio = min_ratio - self.max_ratio = max_ratio - self.model_key = None - self.lang = lang - - if tokenization: - self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') - -
[docs] def compute_stats(self, sample, context=False): - # check if it's computed already - if StatsKeys.word_rep_ratio in sample[Fields.stats]: - return sample - - # try to get words from context - words_key = f'{InterVars.words}-{self.model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.model_key, lang=self.lang, - model_type='sentencepiece') - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - - # try to get refined words from context - refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \ - f'False-[2]-' - if context and refined_words_key in sample[Fields.context]: - words = sample[Fields.context][refined_words_key] - else: - words = words_refinement( - words, - lower_case=True, - strip_chars=SPECIAL_CHARACTERS) - if context: - sample[Fields.context][refined_words_key] = words - word_ngrams = [ - ' '.join(words[i:i + self.n]) - for i in range(len(words) - self.n + 1) - ] - freq_word_ngrams = {} - for word_ngram in word_ngrams: - freq_word_ngrams[word_ngram] = ( - freq_word_ngrams.get(word_ngram, 0) + 1) - - if len(freq_word_ngrams) == 0: - sample[Fields.stats][StatsKeys.word_rep_ratio] = 0.0 - return sample - - freq_word_ngrams = list(freq_word_ngrams.values()) - rep_more_than_one = [freq for freq in freq_word_ngrams if freq > 1] - sample[Fields.stats][StatsKeys.word_rep_ratio] = ( - sum(rep_more_than_one) / - sum(freq_word_ngrams)) if sum(freq_word_ngrams) != 0 else 0.0 - return sample
- -
[docs] def process(self, sample): - if self.min_ratio <= sample[Fields.stats][StatsKeys.word_rep_ratio] \ - <= self.max_ratio: - return True - else: - return False
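Unlike the character-level variant, the word-level ratio is simply the share of n-gram occurrences whose n-gram appears more than once. A standalone sketch with illustrative inputs:

from collections import Counter

def word_rep_ratio(words, n=10):
    # frequency of word-level n-grams, as in compute_stats above
    freqs = list(Counter(' '.join(words[i:i + n])
                         for i in range(len(words) - n + 1)).values())
    if not freqs:
        return 0.0
    repeated = sum(f for f in freqs if f > 1)
    return repeated / sum(freqs)

print(word_rep_ratio(('buy now limited offer ' * 8).split()))   # heavy repetition
print(word_rep_ratio('a sentence with enough distinct words to stay below the threshold'.split()))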
diff --git a/_modules/data_juicer/ops/load.html b/_modules/data_juicer/ops/load.html
deleted file mode 100644
index 5f1fc2c62..000000000
--- a/_modules/data_juicer/ops/load.html
+++ /dev/null

Source code for data_juicer.ops.load

-from .base_op import OPERATORS
-from .op_fusion import fuse_operators
-
-
[docs]def load_ops(process_list, op_fusion=False): - """ - Load op list according to the process list from config file. - - :param process_list: A process list. Each item is an op name and its - arguments. - :param op_fusion: whether to fuse ops that share the same intermediate - variables. - :return: The op instance list. - """ - ops = [] - for process in process_list: - op_name, args = list(process.items())[0] - ops.append(OPERATORS.modules[op_name](**args)) - - # detect filter groups - if op_fusion: - process_list, ops = fuse_operators(process_list, ops) - - return process_list, ops
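A minimal sketch of calling load_ops with a hand-written process list; the op arguments are illustrative, and it assumes data_juicer is importable in the environment:

from data_juicer.ops.load import load_ops

# a process list as it would appear under `process:` in a config file
process_list = [
    {'clean_email_mapper': {}},
    {'alphanumeric_filter': {'min_ratio': 0.25}},
    {'text_length_filter': {'min_len': 10, 'max_len': 10000}},
]

process_list, ops = load_ops(process_list, op_fusion=False)
print([type(op).__name__ for op in ops])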
diff --git a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
deleted file mode 100644
index f9874d929..000000000
--- a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
+++ /dev/null

Source code for data_juicer.ops.mapper.clean_copyright_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_copyright_mapper') -class CleanCopyrightMapper(Mapper): - """Mapper to clean copyright comments at the beginning of the text - samples.""" - - def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/') - self.cpat = re.compile('copyright', re.IGNORECASE) - -
[docs] def process(self, sample): - - r = self.pat.search(sample[self.text_key]) - if r: - # found one, now see if it contains "copyright", if so strip it - span = r.span() - sub = sample[self.text_key][span[0]:span[1]] - if self.cpat.search(sub): - # cut it - sample[self.text_key] = sample[ - self.text_key][:span[0]] + sample[self.text_key][span[1]:] - - return sample - - lines = sample[self.text_key].split('\n') - skip = 0 - - # Greedy replace any file that begins with comment block, most - # are copyright headers - for k in range(len(lines)): - if (lines[k].startswith('//') or lines[k].startswith('#') - or lines[k].startswith('--') or not lines[k]): - skip = skip + 1 - else: - break - - if skip: - # we skipped, consume it - sample[self.text_key] = '\n'.join(lines[skip:]) - return sample
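The mapper first strips a leading C-style block comment if it mentions "copyright". A small sketch of that branch using the same two patterns; the input snippet is invented:

import regex as re

pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/')   # C-style block comments
cpat = re.compile('copyright', re.IGNORECASE)

code = '/* Copyright (c) 2023 Example Corp. */\nint main() { return 0; }\n'
m = pat.search(code)
if m and cpat.search(m.group()):
    code = code[:m.start()] + code[m.end():]
print(code)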
diff --git a/_modules/data_juicer/ops/mapper/clean_email_mapper.html b/_modules/data_juicer/ops/mapper/clean_email_mapper.html
deleted file mode 100644
index c76997818..000000000
--- a/_modules/data_juicer/ops/mapper/clean_email_mapper.html
+++ /dev/null

Source code for data_juicer.ops.mapper.clean_email_mapper

-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_email_mapper') -class CleanEmailMapper(Mapper): - """Mapper to clean email in text samples.""" - - def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+' - -
[docs] def process(self, sample): - - if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL): - return sample - - sample[self.text_key] = re.sub(pattern=self.pattern, - repl=r'', - string=sample[self.text_key], - flags=re.DOTALL) - return sample
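A quick sketch of the substitution this mapper performs, reusing the pattern above on an invented string:

import regex as re

pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'   # same pattern as above
text = 'Contact us at support@example.com or sales@example.org for details.'
print(re.sub(pattern, '', text, flags=re.DOTALL))
# -> 'Contact us at  or  for details.'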
diff --git a/_modules/data_juicer/ops/mapper/clean_html_mapper.html b/_modules/data_juicer/ops/mapper/clean_html_mapper.html
deleted file mode 100644
index f8d6ad18e..000000000
--- a/_modules/data_juicer/ops/mapper/clean_html_mapper.html
+++ /dev/null

Source code for data_juicer.ops.mapper.clean_html_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
-# --------------------------------------------------------
-
-from selectolax.parser import HTMLParser
-
-from ..base_op import OPERATORS, Mapper
-
-
-
-@OPERATORS.register_module('clean_html_mapper')
-class CleanHtmlMapper(Mapper):
-    """Mapper to clean html code in text samples."""
-
-    def __init__(self, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-
-    def process(self, sample):
-
-        def _clean_html(raw_html):
-            raw_html = raw_html.replace('<li>', '\n*')
-            raw_html = raw_html.replace('</li>', '')
-            raw_html = raw_html.replace('<ol>', '\n*')
-            raw_html = raw_html.replace('</ol>', '')
-            parser = HTMLParser(raw_html)
-            return parser.text()
-
-        sample[self.text_key] = _clean_html(sample[self.text_key])
-        return sample
diff --git a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
deleted file mode 100644
index 7cf1f134c..000000000
Source code for data_juicer.ops.mapper.clean_ip_mapper

-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
-@OPERATORS.register_module('clean_ip_mapper')
-class CleanIpMapper(Mapper):
-    """Mapper to clean ipv4 and ipv6 address in text samples."""
-
-    def __init__(self, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param args: extra args
-        :param kwargs: extra args
-        """
-
-        super().__init__(*args, **kwargs)
-        self.pattern = r'(?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|'
-        self.pattern += r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))'
-        self.pattern += r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|'
-        self.pattern += r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|'
-        self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}'  # ipv6
-
-    def process(self, sample):
-
-        if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL):
-            return sample
-
-        sample[self.text_key] = re.sub(pattern=self.pattern,
-                                       repl=r'',
-                                       string=sample[self.text_key],
-                                       flags=re.DOTALL)
-        return sample
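A minimal usage sketch (illustrative; it assumes the module path above and the default text key 'text'):

from data_juicer.ops.mapper.clean_ip_mapper import CleanIpMapper

op = CleanIpMapper()
sample = {'text': 'The server at 192.168.0.1 timed out.'}
print(op.process(sample)['text'])  # the IPv4 address is removed from the text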
diff --git a/_modules/data_juicer/ops/mapper/clean_links_mapper.html b/_modules/data_juicer/ops/mapper/clean_links_mapper.html
deleted file mode 100644
index dd49acc3b..000000000
Source code for data_juicer.ops.mapper.clean_links_mapper

-# Some code here has been modified from:
-# https://github.com/kallewesterling/CleanText/
-# --------------------------------------------------------
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_links_mapper') -class CleanLinksMapper(Mapper): - """Mapper to clean links like http/https/ftp in text samples.""" - - def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = r'(?i)\b(' - self.pattern += r'(?:[a-z][\w-]+:(?:\/{1,3}|' - self.pattern += r'[a-z0-9%])|www\d{0,3}[.]|' - self.pattern += r'[a-z0-9.\-]+[.][a-z]{2,4}\/)' - self.pattern += r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))' - self.pattern += r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' - self.pattern += r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])' - self.pattern += r')' - -
[docs] def process(self, sample): - - if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL): - return sample - - sample[self.text_key] = re.sub(pattern=self.pattern, - repl=r'', - string=sample[self.text_key], - flags=re.DOTALL) - return sample
diff --git a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
deleted file mode 100644
index 2bffdd6b6..000000000
Source code for data_juicer.ops.mapper.expand_macro_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/blob/main/data_prep/arxiv/arxiv_cleaner.py
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('expand_macro_mapper') -class ExpandMacroMapper(Mapper): - """Mapper to expand macro definitions in the document body of Latex - samples.""" - - def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - def _build_non_arg_macros_dict(self, file_content): - # regex for extracting \newcommand macros without arguments - non_arg_nc_reg = re.compile( - # this regex matches the following: - # \newcommand{\macro_name}{macro_value} - # \newcommand*{\macro_name}{macro_value} - # where macro_name is only allowed to contain letters and numbers; - # macro_value can contain any character. - pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$', - flags=re.MULTILINE) - - # regex for extracting \def macros without arguments - non_arg_def_reg = re.compile( - # this regex matches the following: - # \def\macro_name{macro_value} - # where macro_name is only allowed to contain letters and numbers; - # macro_value can contain any character. - pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$', - flags=re.MULTILINE) - - # Extract all user-defined LaTeX macros from the preamble - macros = {} - for reg in [non_arg_nc_reg, non_arg_def_reg]: - for match in reg.finditer(file_content): - # convert the macro name and value to a raw string that can be - # used in re.sub - macro_name = match.group(1).encode('unicode-escape').decode( - 'utf-8') - macro_val = match.group(2).encode('unicode-escape').decode( - 'utf-8') - - macros[macro_name] = macro_val - return macros - -
[docs] def process(self, sample): - non_arg_macros = self._build_non_arg_macros_dict(sample[self.text_key]) - - # TODO: macros that take arguments are not supported yet - arg_macros = {} - - # inline-expand all non-arg macros - for macro_name, macro_value in non_arg_macros.items(): - sample[self.text_key] = re.sub( - # make pattern grouped to make sure that the macro is not part - # of a longer alphanumeric word - pattern=r'(' + macro_name + r')' + r'([^a-zA-Z0-9])', - # replace the macro with its value and add back the character - # that was matched after the macro - repl=macro_value + r'\2', - string=sample[self.text_key]) - - # inline-expand all macros that use args - # TODO: inline-expand macros with args - for macro_name, macro_value in arg_macros.items(): - pass - - return sample
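A minimal usage sketch (illustrative; the LaTeX snippet and the text key 'text' are assumptions, and only argument-free macros are expanded, as noted in the TODO above):

from data_juicer.ops.mapper.expand_macro_mapper import ExpandMacroMapper

op = ExpandMacroMapper()
tex = '\\newcommand{\\dataset}{ImageNet}\nWe evaluate on \\dataset .\n'
result = op.process({'text': tex})['text']
# the body sentence should now read 'We evaluate on ImageNet .'
print(result)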
diff --git a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
deleted file mode 100644
index af8339dbf..000000000
Source code for data_juicer.ops.mapper.fix_unicode_mapper

-import ftfy
-
-from ..base_op import OPERATORS, Mapper
-
-
-
-@OPERATORS.register_module('fix_unicode_mapper')
-class FixUnicodeMapper(Mapper):
-    """Mapper to fix unicode errors in text samples."""
-
-    def __init__(self, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-
-    def process(self, sample):
-        sample[self.text_key] = ftfy.fix_text(sample[self.text_key])
-        return sample
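A minimal usage sketch (illustrative; assumes ftfy is installed along with data_juicer and that the default text key is 'text'):

from data_juicer.ops.mapper.fix_unicode_mapper import FixUnicodeMapper

op = FixUnicodeMapper()
sample = {'text': 'The Mona Lisa doesn‚Äôt have eyebrows.'}
# ftfy should repair the mojibake apostrophe in the sentence above
print(op.process(sample)['text'])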
diff --git a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
deleted file mode 100644
index 39d4f1fe5..000000000
Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('punctuation_normalization_mapper') -class PunctuationNormalizationMapper(Mapper): - """Mapper to normalize unicode punctuations to English punctuations in text - \ samples.""" - - def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.punctuation_unicode = { - ',': ',', - '。': '.', - '、': ',', - '„': '"', - '”': '"', - '“': '"', - '«': '"', - '»': '"', - '1': '"', - '」': '"', - '「': '"', - '《': '"', - '》': '"', - '´': "'", - '∶': ':', - ':': ':', - '?': '?', - '!': '!', - '(': '(', - ')': ')', - ';': ';', - '–': '-', - '—': ' - ', - '.': '. ', - '~': '~', - '’': "'", - '…': '...', - '━': '-', - '〈': '<', - '〉': '>', - '【': '[', - '】': ']', - '%': '%', - '►': '-', - } - -
[docs] def process(self, sample): - sample[self.text_key] = ''.join([ - self.punctuation_unicode.get(c, c) for c in sample[self.text_key] - ]) - return sample
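A minimal usage sketch (illustrative; assumes the module path above and the default text key 'text'):

from data_juicer.ops.mapper.punctuation_normalization_mapper import \
    PunctuationNormalizationMapper

op = PunctuationNormalizationMapper()
sample = {'text': '你好,世界。'}
print(op.process(sample)['text'])  # -> '你好,世界.' (fullwidth comma and period mapped)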
diff --git a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
deleted file mode 100644
index fabcea27f..000000000
Source code for data_juicer.ops.mapper.remove_bibliography_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
-@OPERATORS.register_module('remove_bibliography_mapper')
-class RemoveBibliographyMapper(Mapper):
-    """Mapper to remove bibliography at the end of documents in Latex
-    samples."""
-
-    def __init__(self, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.pattern = r'(\\appendix|'
-        self.pattern += r'\\begin\{references\}|'
-        self.pattern += r'\\begin\{REFERENCES\}|'
-        self.pattern += r'\\begin\{thebibliography\}|'
-        self.pattern += r'\\bibliography\{.*\}'
-        self.pattern += r').*$'
-
-    def process(self, sample):
-        sample[self.text_key] = re.sub(pattern=self.pattern,
-                                       repl=r'',
-                                       string=sample[self.text_key],
-                                       flags=re.DOTALL)
-        return sample
diff --git a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
deleted file mode 100644
index 9f16e8009..000000000
Source code for data_juicer.ops.mapper.remove_comments_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
-# --------------------------------------------------------
-
-from typing import List, Union
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_comments_mapper') -class RemoveCommentsMapper(Mapper): - """ - Mapper to remove comments in different kinds of documents. - - Only support 'tex' \ for now. - """ - - def __init__(self, - doc_type: Union[str, List[str]] = 'tex', - inline: bool = True, - multiline: bool = True, - *args, - **kwargs): - """ - Initialization method. - - :param doc_type: Type of document to remove comments. - :param inline: Whether to remove inline comments. - :param multiline: Whether to remove multiline comments. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.doc_type = doc_type - self.inline = inline - self.multiline = multiline - -
[docs] def process(self, sample): - # TODO: remove different comments by sample type - - if self.inline: - # remove all in comments within a line - sample[self.text_key] = re.sub(pattern=r'[^\\]%.+$', - repl=r'', - string=sample[self.text_key], - flags=re.MULTILINE) - - if self.multiline: - sample[self.text_key] = re.sub(pattern=r'(?m)^%.*\n?', - repl=r'', - string=sample[self.text_key], - flags=re.MULTILINE) - return sample
diff --git a/_modules/data_juicer/ops/mapper/remove_header_mapper.html b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
deleted file mode 100644
index 7f1fa1383..000000000
Source code for data_juicer.ops.mapper.remove_header_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-# TODO
-
[docs]@OPERATORS.register_module('remove_header_mapper') -class RemoveHeaderMapper(Mapper): - """Mapper to remove headers at the beginning of documents in Latex - samples.""" - - def __init__(self, drop_no_head: bool = True, *args, **kwargs): - """ - Initialization method. - - :param drop_no_head: whether to drop sample texts without - headers. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = r'^(.*?)(' - self.pattern += r'\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}' - self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}' - self.pattern += r')' - - self.drop_no_head = drop_no_head - -
[docs] def process(self, sample): - - if not re.search(self.pattern, sample[self.text_key], flags=re.DOTALL): - if self.drop_no_head: - sample[self.text_key] = '' - return sample - - sample[self.text_key] = re.sub(pattern=self.pattern, - repl=r'\2', - string=sample[self.text_key], - flags=re.DOTALL) - return sample
diff --git a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
deleted file mode 100644
index ff560bf21..000000000
Source code for data_juicer.ops.mapper.remove_long_words_mapper

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-import sys
-
-from jsonargparse.typing import PositiveInt
-
-from ..base_op import OPERATORS, Mapper
-from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
-                      split_on_newline_tab_whitespace, strip)
-
-
-
[docs]@OPERATORS.register_module('remove_long_words_mapper') -class RemoveLongWordsMapper(Mapper): - """Mapper to remove long words within a specific range.""" - - def __init__(self, - min_len: PositiveInt = 1, - max_len: PositiveInt = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min mapper word length in this op, words - will be filtered if their length is below this parameter. - :param max_len: The max mapper word length in this op, words - will be filtered if their length exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len - -
[docs] def should_keep_long_word(self, word): - if self.min_len <= len(word) <= self.max_len: - return True - elif self.min_len <= len(strip(word, - SPECIAL_CHARACTERS)) <= self.max_len: - return True - else: - return False
- -
[docs] def process(self, sample): - - sentences = split_on_newline_tab_whitespace(sample[self.text_key]) - sentences = [[[ - word for word in subsentence if self.should_keep_long_word(word) - ] for subsentence in sentence] for sentence in sentences] - sample[self.text_key] = merge_on_whitespace_tab_newline(sentences) - return sample
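A minimal usage sketch (illustrative; assumes the module path above and the default text key 'text'):

from data_juicer.ops.mapper.remove_long_words_mapper import RemoveLongWordsMapper

op = RemoveLongWordsMapper(max_len=20)
sample = {'text': 'short words survive but aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa does not'}
print(op.process(sample)['text'])  # the over-long token is filtered out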
diff --git a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
deleted file mode 100644
index 63c2895db..000000000
Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

-from typing import List, Union
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_specific_chars_mapper') -class RemoveSpecificCharsMapper(Mapper): - """Mapper to clean specific chars in text samples.""" - - def __init__(self, - chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□', - *args, - **kwargs): - """ - Initialization method. - - :param chars_to_remove: a list or a string including all - characters that need to be removed from text. - :param args: extra args - :param kwargs: extra args - """ - - super().__init__(*args, **kwargs) - if chars_to_remove: - self.pattern = '[' + '|'.join(chars_to_remove) + ']' - else: - self.pattern = None - -
[docs] def process(self, sample): - - if self.pattern is None: - return sample - - sample[self.text_key] = re.sub(pattern=self.pattern, - repl=r'', - string=sample[self.text_key], - flags=re.DOTALL) - return sample
diff --git a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
deleted file mode 100644
index e8d0dd886..000000000
Source code for data_juicer.ops.mapper.remove_table_text_mapper

-import regex as re
-from jsonargparse.typing import restricted_number_type
-
-from ..base_op import OPERATORS, Mapper
-
-from_2_to_20 = restricted_number_type('from_2_to_20', int, [('>=', 2),
-                                                            ('<=', 20)])
-
-
-
[docs]@OPERATORS.register_module('remove_table_text_mapper') -class RemoveTableTextMapper(Mapper): - """ - Mapper to remove table texts from text samples. - - Regular expression is used to remove tables in the range of column - number of tables. - """ - - def __init__(self, - min_col: from_2_to_20 = 2, - max_col: from_2_to_20 = 20, - *args, - **kwargs): - """ - Initialization method. - - :param min_col: The min number of columns of table to remove. - :param max_col: The max number of columns of table to remove. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_col = min_col - self.max_col = max_col - self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}' - -
[docs] def process(self, sample): - - text = sample[self.text_key] - for i in range(self.min_col - 1, self.max_col): - pattern = re.compile(self.pattern % i) - text = pattern.sub('', text) - - sample[self.text_key] = text - return sample
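A minimal usage sketch (illustrative; the tiny whitespace-separated table and the text key 'text' are assumptions):

from data_juicer.ops.mapper.remove_table_text_mapper import RemoveTableTextMapper

op = RemoveTableTextMapper()
text = 'Results are listed below.\ncol1 col2 col3\n1 2 3\n4 5 6\nEnd of report.'
print(op.process({'text': text})['text'])  # the three-column block is stripped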
diff --git a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
deleted file mode 100644
index 9928a34fd..000000000
Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper

-from jsonargparse.typing import List
-
-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Mapper
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      merge_on_whitespace_tab_newline,
-                      split_on_newline_tab_whitespace, strip)
-
-
-
[docs]@OPERATORS.register_module('remove_words_with_incorrect_substrings_mapper') -class RemoveWordsWithIncorrectSubstringsMapper(Mapper): - """Mapper to remove words with incorrect substrings.""" - - def __init__(self, - lang: str = 'en', - tokenization: bool = False, - substrings: List = None, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language - :param tokenization: whether to use model to tokenize documents - :param substrings: The incorrect substrings in words. - :param args: extra args - :param kwargs: extra args - """ - if substrings is None: - substrings = ['http', 'www', '.com', 'href', '//'] - super().__init__(*args, **kwargs) - self.tokenization = tokenization - self.substrings = substrings - self.lang = lang - if tokenization: - self.model_key = prepare_model(lang=lang, - model_type='sentencepiece') - -
[docs] def should_keep_word_with_incorrect_substrings(self, word, substrings): - word = strip(word, SPECIAL_CHARACTERS) - should_keep = all([(i_substr not in word) for i_substr in substrings]) - return should_keep
- -
[docs] def process(self, sample): - if self.tokenization: - tokenizer = get_model(self.model_key, lang=self.lang, model_type='sentencepiece') - sentences = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - words = [ - word.replace('▁', '') for word in sentences - if self.should_keep_word_with_incorrect_substrings( - word.replace('▁', ''), self.substrings) - ] - if len(words) != len(sentences): - sample[self.text_key] = ''.join(words) - else: - sentences = split_on_newline_tab_whitespace(sample[self.text_key]) - sentences = [[[ - word for word in subsentence - if self.should_keep_word_with_incorrect_substrings( - word, self.substrings) - ] for subsentence in sentence] for sentence in sentences] - sample[self.text_key] = merge_on_whitespace_tab_newline(sentences) - return sample
diff --git a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
deleted file mode 100644
index eb3ebf0c2..000000000
Source code for data_juicer.ops.mapper.sentence_split_mapper

-from data_juicer.utils.model_utils import prepare_model, get_model
-
-from ..base_op import OPERATORS, Mapper
-from ..common import get_sentences_from_document
-
-
-
-@OPERATORS.register_module('sentence_split_mapper')
-class SentenceSplitMapper(Mapper):
-    """Mapper to split text samples to sentences."""
-
-    def __init__(self, lang: str = 'en', *args, **kwargs):
-        """
-        Initialization method.
-
-        :param lang: split sentence of text in which language.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.lang = lang
-        self.model_key = prepare_model(lang=lang, model_type='nltk')
-
-    def process(self, sample):
-
-        nltk_model = get_model(self.model_key, lang=self.lang, model_type='nltk')
-        sample[self.text_key] = get_sentences_from_document(
-            sample[self.text_key],
-            model_func=nltk_model.tokenize if nltk_model else None)
-        return sample
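A minimal usage sketch (illustrative; constructing the operator downloads the English punkt model into the data_juicer cache on first use, and the default text key 'text' is assumed):

from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper

op = SentenceSplitMapper(lang='en')
sample = {'text': 'Hello world. Data cleaning is fun.'}
# the detected sentences are re-joined, typically one per line
print(op.process(sample)['text'])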
diff --git a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
deleted file mode 100644
index d1999d5e4..000000000
Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

-# Most of the code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from ..base_op import OPERATORS, Mapper
-from ..common.special_characters import VARIOUS_WHITESPACES
-
-
-@OPERATORS.register_module('whitespace_normalization_mapper')
-class WhitespaceNormalizationMapper(Mapper):
-    """
-    Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20)
-    in text samples.
-
-    Different kinds of whitespaces can be found here:
-    https://en.wikipedia.org/wiki/Whitespace_character
-    """
-
-    def __init__(self, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-
-    def process(self, sample):
-        # remove whitespaces before and after the main content
-        text = sample[self.text_key].strip()
-
-        # replace all kinds of whitespaces with ' '
-        sample[self.text_key] = ''.join(
-            [char if char not in VARIOUS_WHITESPACES else ' '
-             for char in text])
-
-        return sample
diff --git a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
deleted file mode 100644
index b0d85ef19..000000000
Source code for data_juicer.ops.selector.frequency_specified_field_selector

-import numbers
-
-from jsonargparse.typing import ClosedUnitInterval, PositiveInt
-
-from ..base_op import OPERATORS, Selector
-
-
-
[docs]@OPERATORS.register_module('frequency_specified_field_selector') -class FrequencySpecifiedFieldSelector(Selector): - """Selector to select samples based on the sorted frequency of specified - field.""" - - def __init__(self, - field_key: str = '', - top_ratio: ClosedUnitInterval = None, - topk: PositiveInt = None, - reverse: bool = True, - *args, - **kwargs): - """ - Initialization method. - - :param field_key: Selector based on the specified value - corresponding to the target key. The target key - corresponding to multi-level field information need to be - separated by '.'. - :param top_ratio: Ratio of selected top specified field value, - samples will be selected if their specified field values are - within this parameter. When both topk and top_ratio are set, - the value corresponding to the smaller number of samples - will be applied. - :param topk: Number of selected top specified field value, - samples will be selected if their specified field values are - within this parameter. When both topk and top_ratio are set, - the value corresponding to the smaller number of samples - will be applied. - :param reverse: Determine the sorting rule, if reverse=True, - then sort in descending order. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.field_key = field_key - self.top_ratio = top_ratio - self.topk = topk - self.reverse = reverse - -
[docs] def process(self, dataset): - if len(dataset) <= 1 or not self.field_key: - return dataset - - field_keys = self.field_key.split('.') - assert field_keys[0] in dataset.features.keys( - ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) - - field_value_dict = {} - for i, item in enumerate(dataset[field_keys[0]]): - field_value = item - for key in field_keys[1:]: - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - assert field_value is None or isinstance( - field_value, str) or isinstance( - field_value, numbers.Number - ), 'The {} item is not String, Numbers or NoneType'.format(i) - if field_value not in field_value_dict.keys(): - field_value_dict[field_value] = [i] - else: - field_value_dict[field_value].append(i) - - select_num = 0 - if not self.top_ratio: - if not self.topk: - return dataset - else: - select_num = self.topk - else: - select_num = self.top_ratio * len(field_value_dict) - if self.topk and self.topk < select_num: - select_num = self.topk - - select_index = sum( - sorted(field_value_dict.values(), - key=lambda x: len(x), - reverse=self.reverse)[:int(select_num)], []) - return dataset.select(select_index)
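A minimal usage sketch (illustrative; assumes a HuggingFace datasets.Dataset and a hypothetical nested field 'meta.lang'):

from datasets import Dataset

from data_juicer.ops.selector.frequency_specified_field_selector import \
    FrequencySpecifiedFieldSelector

ds = Dataset.from_list([
    {'text': 'a', 'meta': {'lang': 'en'}},
    {'text': 'b', 'meta': {'lang': 'en'}},
    {'text': 'c', 'meta': {'lang': 'zh'}},
])
selector = FrequencySpecifiedFieldSelector(field_key='meta.lang', topk=1)
subset = selector.process(ds)
print(len(subset))  # 2: the rows of the single most frequent 'meta.lang' value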
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
deleted file mode 100644
index 155754b0d..000000000
Source code for data_juicer.ops.selector.topk_specified_field_selector

-import heapq
-import sys
-
-from jsonargparse.typing import ClosedUnitInterval, PositiveInt
-
-from ..base_op import OPERATORS, Selector
-
-
-
-def to_number(s, reverse=True):
-    try:
-        return float(s)
-    except Exception:
-        if reverse:
-            return -sys.maxsize
-        else:
-            return sys.maxsize
- - -
[docs]@OPERATORS.register_module('topk_specified_field_selector') -class TopkSpecifiedFieldSelector(Selector): - """Selector to select top samples based on the sorted specified field - value.""" - - def __init__(self, - field_key: str = '', - top_ratio: ClosedUnitInterval = None, - topk: PositiveInt = None, - reverse: bool = True, - *args, - **kwargs): - """ - Initialization method. - - :param field_key: Selector based on the specified value - corresponding to the target key. The target key - corresponding to multi-level field information need to be - separated by '.'. - :param top_ratio: Ratio of selected top samples, samples will be - selected if their specified field values are within this - parameter. When both topk and top_ratio are set, the value - corresponding to the smaller number of samples will be - applied. - :param topk: Number of selected top sample, samples will be - selected if their specified field values are within this - parameter. When both topk and top_ratio are set, the value - corresponding to the smaller number of samples will be - applied. - :param reverse: Determine the sorting rule, if reverse=True, - then sort in descending order. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.field_key = field_key - self.top_ratio = top_ratio - self.topk = topk - self.reverse = reverse - -
[docs] def process(self, dataset): - if len(dataset) <= 1 or not self.field_key: - return dataset - - select_num = 0 - if not self.top_ratio: - if not self.topk: - return dataset - else: - select_num = self.topk - else: - select_num = self.top_ratio * len(dataset) - if self.topk and self.topk < select_num: - select_num = self.topk - - field_keys = self.field_key.split('.') - assert field_keys[0] in dataset.features.keys( - ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) - - if len(field_keys) == 1: - field_value_list = dataset[field_keys[0]] - else: - field_value_list = [] - for item in dataset[field_keys[0]]: - field_value = item - for key in field_keys[1:]: - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - field_value_list.append(to_number(field_value, self.reverse)) - - if self.reverse: - select_index = heapq.nlargest(int(select_num), range(len(dataset)), - field_value_list.__getitem__) - else: - select_index = heapq.nsmallest(int(select_num), - range(len(dataset)), - field_value_list.__getitem__) - return dataset.select(select_index)
diff --git a/_modules/data_juicer/utils/asset_utils.html b/_modules/data_juicer/utils/asset_utils.html
deleted file mode 100644
index 21ae2ea46..000000000
Source code for data_juicer.utils.asset_utils

-import json
-import os
-
-import requests
-from loguru import logger
-
-from .cache_utils import DATA_JUICER_ASSETS_CACHE
-
-# Default directory to store auxiliary resources
-ASSET_DIR = DATA_JUICER_ASSETS_CACHE
-
-# Default cached assets links for downloading
-ASSET_LINKS = {
-    'flagged_words':
-    'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
-    'data_juicer/flagged_words.json',
-    'stopwords':
-    'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
-    'data_juicer/stopwords.json',
-}
-
-
-
[docs]def load_words_asset(words_dir: str, words_type: str): - """ - Load words from a asset file named `words_type`, if not find a valid asset - file, then download it from ASSET_LINKS cached by data_juicer team. - - :param words_dir: directory that stores asset file(s) - :param words_type: name of target words assets - :return: a dict that stores words assets, whose keys are language - names, and the values are lists of words - """ - words_dict = {} - os.makedirs(words_dir, exist_ok=True) - - # try to load words from `words_type` file - for filename in os.listdir(words_dir): - if filename.endswith('.json') and words_type in filename: - with open(os.path.join(words_dir, filename), 'r') as file: - loaded_words = json.load(file) - for key in loaded_words: - if key in words_dict: - words_dict[key] += loaded_words[key] - else: - words_dict[key] = loaded_words[key] - # if the asset file is not found, then download it from ASSET_LINKS - if not bool(words_dict): - logger.info(f'Specified {words_dir} does not contain ' - f'any {words_type} files in json format, now ' - 'download the one cached by data_juicer team') - response = requests.get(ASSET_LINKS[words_type]) - words_dict = response.json() - # cache the asset file locally - cache_path = os.path.join(words_dir, f'{words_type}.json') - with open(cache_path, 'w') as file: - json.dump(words_dict, file) - - return words_dict
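A minimal usage sketch (illustrative; the first call downloads stopwords.json into the assets cache directory if it is not already there):

from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset

stopwords = load_words_asset(words_dir=ASSET_DIR, words_type='stopwords')
print(sorted(stopwords.keys())[:5])  # language keys, each mapped to a word list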
diff --git a/_modules/data_juicer/utils/ckpt_utils.html b/_modules/data_juicer/utils/ckpt_utils.html
deleted file mode 100644
index 62bb44ec9..000000000
Source code for data_juicer.utils.ckpt_utils

-import json
-import os
-
-from datasets import Dataset
-from loguru import logger
-
-
-
[docs]class CheckpointManager: - """ - This class is used to save the latest version of dataset to checkpoint - directory or load it from checkpoint directory, a bit like cache management - Rerun the same config will reload the checkpoint and skip ops before it. - - If any args of operator in process list is changed, all ops will be - rerun from the beginning. - """ - - def __init__(self, ckpt_dir, original_process_list, num_proc=1): - """ - Initialization method. - - :param ckpt_dir: path to save and load checkpoint - :param original_process_list: process list in config - :param num_proc: number of process workers when saving dataset - """ - self.ckpt_dir = ckpt_dir - self.ckpt_ds_dir = os.path.join(self.ckpt_dir, 'latest') - self.ckpt_op_record = os.path.join(self.ckpt_dir, 'ckpt_op.json') - self.process_list = original_process_list - self.num_proc = num_proc - self.op_record = [] - - self.ckpt_available = self.check_ckpt() - -
[docs] def get_left_process_list(self): - """ - Get left process list of ops for processing dataset, when checkpoint is - available, remove some ops from process list, otherwise keep it - unchanged. - - :return: process list of left ops - """ - return self.process_list
- -
[docs] def check_ckpt(self): - """ - Check if checkpoint is available. - - :return: True when checkpoint is available, else False - """ - if os.path.exists(self.ckpt_ds_dir) \ - and os.path.isdir(self.ckpt_ds_dir) \ - and os.path.exists(self.ckpt_op_record) \ - and os.path.isfile(self.ckpt_op_record) \ - and self.check_ops_to_skip(): - return True - else: - os.makedirs(self.ckpt_dir, exist_ok=True) - return False
- -
[docs] def record(self, op_name, op_args): - """Save op name and args to op record, which is used to compare with - the process list from config to decide if a checkpoint is available.""" - self.op_record.append({op_name: op_args})
- -
[docs] def check_ops_to_skip(self): - """ - Check which ops need to be skipped in the process list. - - If op record list from checkpoint are the same as the prefix - part of process list, then skip these ops and start processing - from the checkpoint. Otherwise, process the original dataset - from scratch. - - :return: whether to skip somme ops or not - """ - - # load op records - with open(self.ckpt_op_record, 'r') as fin: - self.op_record = json.load(fin) - - # check whether the op records are exactly the same - # with prefix of process list - # 1. same: remove these ops from process list - # 2. different: cleanup op record, and keep process list unchanged - recorded_op_num = len(self.op_record) - prefix_process = self.process_list[:recorded_op_num] - all_the_same = True - dif1, dif2 = None, None - - for record_op, config_op in zip(self.op_record, prefix_process): - if record_op != config_op: - all_the_same = False - dif1, dif2 = record_op, config_op - break - if all_the_same: - for op in self.op_record: - op_name = list(op.keys())[0] - logger.info(f'Skip op [{op_name}].') - self.process_list = self.process_list[recorded_op_num:] - return True - else: - logger.warning(f'Processed ops of checkpoint are different from ' - f'current configs: checkpoint-{dif1} vs. config-' - f'{dif2}. All ops will be processed from the ' - f'beginning') - self.op_record = [] - return False
- -
[docs] def save_ckpt(self, ds): - """ - Save dataset to checkpoint directory and dump processed ops list. - - :param ds: input dataset to save - """ - ds.save_to_disk(self.ckpt_ds_dir, num_proc=self.num_proc) - - with open(self.ckpt_op_record, 'w') as fout: - json.dump(self.op_record, fout)
- -
[docs] def load_ckpt(self): - """ - Load dataset from a checkpoint file. - - :return: a dataset stored in checkpoint file. - """ - ds = Dataset.load_from_disk(self.ckpt_ds_dir) - return ds
-
diff --git a/_modules/data_juicer/utils/file_utils.html b/_modules/data_juicer/utils/file_utils.html
deleted file mode 100644
index af14e5c6a..000000000
Source code for data_juicer.utils.file_utils

-from pathlib import Path
-from typing import List, Tuple, Union
-
-from datasets.utils.extract import ZstdExtractor as Extractor
-
-
-
[docs]def find_files_with_suffix( - path: Union[str, Path], - suffixes: Union[str, List[str], Tuple[str]] = None) -> List[str]: - """ - Traverse a path to find all files with the specified suffixes. - - :param path: path (str/Path): source path - :param suffixes: specified file suffixes, '.txt' or ['.txt', '.md'] - etc - :return: list of all files with the specified suffixes - """ - path = Path(path) - file_dict = {} - - if suffixes is None: - suffixes = [] - - if isinstance(suffixes, str): - suffixes = [suffixes] - - suffixes = [ - x.lower() if x.startswith('.') else '.' + x.lower() for x in suffixes - ] - - if path.is_file(): - files = [path] - else: - searched_files = path.rglob('*') - files = [file for file in searched_files if file.is_file()] - - extractor = Extractor - - # only keep the file with the specified suffixes - for file in files: - suffix = file.suffix.lower() - - if extractor.is_extractable(file): - - # TODO - # hard code - # only support zstd-format file now, - # and use the last 2 sub-suffixes as the final suffix - # just like '.jsonl.zst' - file_suffixes = [suffix.lower() for suffix in file.suffixes] - suffix = ''.join(file_suffixes[-2:]) - - if not suffixes or (suffix in suffixes): - if suffix not in file_dict: - file_dict[suffix] = [str(file)] - else: - file_dict[suffix].append(str(file)) - return file_dict
- - -
[docs]def is_absolute_path(path: Union[str, Path]) -> bool: - """ - Check whether input path is a absolute path. - - :param path: input path - :return: True means input path is absolute path, False means input - path is a relative path. - """ - return Path(path).is_absolute()
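A minimal usage sketch (illustrative; '/data' is a hypothetical directory, and note that find_files_with_suffix returns a dict keyed by suffix rather than a flat list):

from data_juicer.utils.file_utils import find_files_with_suffix, is_absolute_path

# e.g. {'.jsonl': ['/data/a.jsonl', ...], '.txt': ['/data/b.txt', ...]}
files = find_files_with_suffix('/data', suffixes=['jsonl', '.txt'])
print(is_absolute_path('/data'))  # True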
diff --git a/_modules/data_juicer/utils/logger_utils.html b/_modules/data_juicer/utils/logger_utils.html
deleted file mode 100644
index 726f0102d..000000000
Source code for data_juicer.utils.logger_utils

-# Some codes here are adapted from
-# https://github.com/MegEngine/YOLOX/blob/main/yolox/utils/logger.py
-
-# Copyright 2021 Megvii, Base Detection
-#
-#    Licensed under the Apache License, Version 2.0 (the "License");
-#    you may not use this file except in compliance with the License.
-#    You may obtain a copy of the License at
-#
-#        http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS,
-#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#    See the License for the specific language governing permissions and
-#    limitations under the License.
-
-import inspect
-import os
-import sys
-
-from loguru import logger
-from loguru._file_sink import FileSink
-
-LOGGER_SETUP = False
-
-
-
[docs]def get_caller_name(depth=0): - """ - Get caller name by depth. - - :param depth: depth of caller context, use 0 for caller depth. - :return: module name of the caller - """ - # the following logic is a little bit faster than inspect.stack() logic - frame = inspect.currentframe().f_back - for _ in range(depth): - frame = frame.f_back - - return frame.f_globals['__name__']
- - -
[docs]class StreamToLoguru: - """Stream object that redirects writes to a logger instance.""" - - def __init__(self, level='INFO', caller_names=('datasets', 'logging')): - """ - Initialization method. - - :param level: log level string of loguru. Default value: "INFO". - :param caller_names: caller names of redirected module. - Default value: (apex, pycocotools). - """ - self.level = level - self.linebuf = '' - self.caller_names = caller_names - -
[docs] def write(self, buf): - full_name = get_caller_name(depth=1) - module_name = full_name.rsplit('.', maxsplit=-1)[0] - if module_name in self.caller_names: - for line in buf.rstrip().splitlines(): - # use caller level log - logger.opt(depth=2).log(self.level, line.rstrip()) - else: - # sys.__stdout__.write(buf) - logger.opt(raw=True).info(buf)
- -
[docs] def flush(self): - pass
- - -
[docs]def redirect_sys_output(log_level='INFO'): - """ - Redirect stdout/stderr to loguru with log level. - - :param log_level: log level string of loguru. Default value: "INFO". - """ - redirect_logger = StreamToLoguru(log_level) - sys.stderr = redirect_logger - sys.stdout = redirect_logger
- - -
[docs]def get_log_file_path(): - """ - Get the path to the location of the log file. - - :return: a location of log file. - """ - for _, handler in logger._core.handlers.items(): - if isinstance(handler._sink, FileSink): - return handler._sink._file.name
- - -
[docs]def setup_logger(save_dir, distributed_rank=0, filename='log.txt', mode='o', redirect=True): - """ - Setup logger for training and testing. - - :param save_dir: location to save log file - :param distributed_rank: device rank when multi-gpu environment - :param filename: log file name to save - :param mode: log file write mode, `append` or `override`. default is `o`. - :param redirect: whether to redirect system output - :return: logger instance. - """ - global LOGGER_SETUP - - if LOGGER_SETUP: - return - - loguru_format = ( - '<green>{time:YYYY-MM-DD HH:mm:ss}</green> | ' - '<level>{level: <8}</level> | ' - '<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>') - - logger.remove() - save_file = os.path.join(save_dir, filename) - if mode == 'o' and os.path.exists(save_file): - os.remove(save_file) - - # only keep logger in rank0 process - if distributed_rank == 0: - logger.add( - sys.stderr, - format=loguru_format, - level='INFO', - enqueue=True, - ) - logger.add(save_file) - - # redirect stdout/stderr to loguru - if redirect: - redirect_sys_output('INFO') - LOGGER_SETUP = True
- -
[docs]class HiddenPrints: - """Define a range that hide the outputs within this range.""" - def __enter__(self): - """ - Store the original standard output and redirect the standard output to - null when entering this range. - """ - self._original_stdout = sys.stdout - sys.stdout = open(os.devnull, 'w') - - def __exit__(self, exc_type, exc_val, exc_tb): - """ - Close the redirected standard output and restore it when exiting from - this range. - """ - sys.stdout.close() - sys.stdout = self._original_stdout
diff --git a/_modules/data_juicer/utils/model_utils.html b/_modules/data_juicer/utils/model_utils.html
deleted file mode 100644
index 56b09db10..000000000
Source code for data_juicer.utils.model_utils

-import os
-
-import wget
-from loguru import logger
-
-from .cache_utils import DATA_JUICER_MODELS_CACHE
-
-# Default directory to store models
-MODEL_PATH = DATA_JUICER_MODELS_CACHE
-
-# Default backup cached models links for downloading
-BACKUP_MODEL_LINKS = {
-    # language identification model from fasttext
-    'lid.176.bin':
-    'https://dl.fbaipublicfiles.com/fasttext/supervised-models/',
-
-    # tokenizer and language model for English from sentencepiece and KenLM
-    '%s.sp.model':
-    'https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/',
-    '%s.arpa.bin':
-    'https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/',
-
-    # sentence split model from nltk punkt
-    'punkt.%s.pickle':
-    'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
-    'data_juicer/models/'
-}
-
-# Default cached models links for downloading
-MODEL_LINKS = 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/' \
-               'data_juicer/models/'
-
-MODEL_ZOO = {}
-
-
-
[docs]def check_model(model_name, args=(), force=False): - """ - Check whether a model exists in MODEL_PATH. If exists, return its full path - Else, download it from cached models links. - - :param model_name: a specified model name - :param args: optional extra args of model. - :param force: Whether to download model forcefully or not, Sometimes - the model file maybe incomplete for some reason, so need to - download again forcefully. - """ - if not os.path.exists(MODEL_PATH): - os.makedirs(MODEL_PATH) - - # check if the specified model exists. If it does not exist, download it - true_model_name = model_name % args - mdp = os.path.join(MODEL_PATH, true_model_name) - if force: - if os.path.exists(mdp): - os.remove(mdp) - logger.info( - f'Model [{true_model_name}] invalid, force to downloading...') - else: - logger.info( - f'Model [{true_model_name}] not found . Downloading...') - - try: - model_link = os.path.join(MODEL_LINKS, true_model_name) - wget.download(model_link, mdp, bar=None) - except: # noqa: E722 - try: - backup_model_link = os.path.join( - BACKUP_MODEL_LINKS[model_name], true_model_name) - wget.download(backup_model_link, mdp, bar=None) - except: # noqa: E722 - logger.error( - f'Downloading model [{true_model_name}] error. ' - f'Please retry later or download it into {MODEL_PATH} ' - f'manually from {model_link} or {backup_model_link} ') - exit(1) - return mdp
- - -
[docs]def prepare_fasttext_model(model_name): - """ - Prepare and load a fasttext model. - - :param model_name: input model name - :return: model instance. - """ - import fasttext - logger.info('Loading fasttext language identification model...') - try: - ft_model = fasttext.load_model(check_model(model_name)) - except: # noqa: E722 - ft_model = fasttext.load_model(check_model(model_name, force=True)) - return ft_model
- - -
[docs]def prepare_sentencepiece_model(model_name, lang): - """ - Prepare and load a sentencepiece model. - - :param model_name: input model name in formatting syntax - :param lang: language to render model name - :return: model instance. - """ - import sentencepiece - logger.info('Loading sentencepiece model...') - sentencepiece_model = sentencepiece.SentencePieceProcessor() - try: - sentencepiece_model.load(check_model(model_name, lang)) - except: # noqa: E722 - sentencepiece_model.load(check_model(model_name, lang, force=True)) - return sentencepiece_model
- - -
[docs]def prepare_kenlm_model(model_name, lang): - """ - Prepare and load a kenlm model. - - :param model_name: input model name in formatting syntax. - :param lang: language to render model name - :return: model instance. - """ - import kenlm - logger.info('Loading kenlm language model...') - try: - kenlm_model = kenlm.Model(check_model(model_name, lang)) - except: # noqa: E722 - kenlm_model = kenlm.Model(check_model(model_name, lang, force=True)) - return kenlm_model
- - -
[docs]def prepare_nltk_model(model_name, lang): - """ - Prepare and load a nltk punkt model. - - :param model_name: input model name in formatting syntax - :param lang: language to render model name - :return: model instance. - """ - - nltk_to_punkt = { - 'en': 'english', - 'fr': 'french', - 'pt': 'portuguese', - 'es': 'spanish' - } - assert lang in nltk_to_punkt.keys( - ), 'lang must be one of the following: {}'.format( - list(nltk_to_punkt.keys())) - - from nltk.data import load - logger.info('Loading nltk punkt split model...') - try: - nltk_model = load(check_model(model_name, nltk_to_punkt[lang])) - except: # noqa: E722 - nltk_model = load( - check_model(model_name, nltk_to_punkt[lang], force=True)) - return nltk_model
- - -
[docs]def prepare_huggingface_tokenizer(tokenizer_name): - """ - Prepare and load a tokenizer from HuggingFace. - - :param tokenizer_name: input tokenizer name - :return: a tokenizer instance. - """ - from transformers import AutoTokenizer - logger.info('Loading tokenizer from HuggingFace...') - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - trust_remote_code=True) - return tokenizer
- -
[docs]def prepare_diversity_model(model_name, lang): - """ - Prepare diversity model for specific language. - - :param model_name: the model name to be loaded. - :param lang: language of diversity model. Should be one of ["zh", - "en"] - :return: corresponding diversity model - """ - import spacy - assert lang in ['zh', 'en'], 'Diversity only support zh and en' - model_name = model_name % lang - logger.info(f'Loading spacy model [{model_name}]...') - compressed_model = '%s.zip' % model_name - - # decompress the compressed model if it's not decompressed - def decompress_model(compressed_model_path): - decompressed_model_path = compressed_model_path.replace('.zip', '') - if os.path.exists(decompressed_model_path) \ - and os.path.isdir(decompressed_model_path): - return decompressed_model_path - import zipfile - with zipfile.ZipFile(compressed_model_path) as zf: - zf.extractall(MODEL_PATH) - return decompressed_model_path - - try: - diversity_model = spacy.load( - decompress_model(check_model(compressed_model))) - except: # noqa: E722 - diversity_model = spacy.load( - decompress_model(check_model(compressed_model, force=True))) - return diversity_model
- - -
[docs]def prepare_model(lang='en', model_type='sentencepiece', model_key=None): - """ - Prepare and load a model or a tokenizer from MODEL_ZOO. - - :param lang: which lang model to load - :param model_type: model or tokenizer type - :param model_key: tokenizer name, only used when prepare HuggingFace - tokenizer - :return: a model or tokenizer instance - """ - - type_to_name = { - 'fasttext': ('lid.176.bin', prepare_fasttext_model), - 'sentencepiece': ('%s.sp.model', prepare_sentencepiece_model), - 'kenlm': ('%s.arpa.bin', prepare_kenlm_model), - 'nltk': ('punkt.%s.pickle', prepare_nltk_model), - 'huggingface': ('%s', prepare_huggingface_tokenizer), - 'spacy': ('%s_core_web_md-3.5.0', prepare_diversity_model), - } - assert model_type in type_to_name.keys( - ), 'model_type must be one of the following: {}'.format( - list(type_to_name.keys())) - - if model_key is None: - model_key = model_type + '_' + lang - if model_key not in MODEL_ZOO.keys(): - model_name, model_func = type_to_name[model_type] - if model_type == 'fasttext': - MODEL_ZOO[model_key] = model_func(model_name) - elif model_type == 'huggingface': - MODEL_ZOO[model_key] = model_func(model_key) - else: - MODEL_ZOO[model_key] = model_func(model_name, lang) - return model_key
- - -
[docs]def get_model(model_key, lang='en', model_type='sentencepiece'): - """ - Get a model or a tokenizer from MODEL_ZOO. - - :param model_key: name of the model or tokenzier - """ - if model_key not in MODEL_ZOO: - prepare_model(lang=lang, model_type=model_type, model_key=model_key) - return MODEL_ZOO.get(model_key, None)
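A minimal usage sketch (illustrative; the first call downloads the English sentencepiece model into the data_juicer model cache):

from data_juicer.utils.model_utils import get_model, prepare_model

model_key = prepare_model(lang='en', model_type='sentencepiece')
tokenizer = get_model(model_key, lang='en', model_type='sentencepiece')
pieces = tokenizer.encode_as_pieces('a quick test') if tokenizer else []
print(pieces)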
diff --git a/_modules/data_juicer/utils/registry.html b/_modules/data_juicer/utils/registry.html
deleted file mode 100644
index 0830c6f22..000000000
Source code for data_juicer.utils.registry

-# Copyright (c) Alibaba, Inc. and its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# --------------------------------------------------------
-# Most of the code here has been modified from:
-#  https://github.com/modelscope/modelscope/blob/master/modelscope/utils/registry.py
-# --------------------------------------------------------
-
-from loguru import logger
-
-
-
class Registry(object):
    """This class is used to register some modules to registry by a repo
    name."""

    def __init__(self, name: str):
        """
        Initialization method.

        :param name: a registry repo name
        """
        self._name = name
        self._modules = {}

    @property
    def name(self):
        """
        Get name of current registry.

        :return: name of current registry.
        """
        return self._name

    @property
    def modules(self):
        """
        Get all modules in current registry.

        :return: a dict storing modules in current registry.
        """
        return self._modules

    def list(self):
        """Log the list of modules in the current registry."""
        for m in self._modules.keys():
            logger.info(f'{self._name}\t{m}')

    def get(self, module_key):
        """
        Get module named module_key from the current registry. If not found,
        return None.

        :param module_key: specified module name
        :return: module named module_key
        """
        return self._modules.get(module_key, None)

    def _register_module(self, module_name=None, module_cls=None, force=False):
        """
        Register module to registry.

        :param module_name: module name
        :param module_cls: module class object
        :param force: Whether to override an existing class with the
            same name. Default: False.
        """

        if module_name is None:
            module_name = module_cls.__name__

        if module_name in self._modules and not force:
            raise KeyError(
                f'{module_name} is already registered in {self._name}')

        self._modules[module_name] = module_cls

    def register_module(self,
                        module_name: str = None,
                        module_cls: type = None,
                        force=False):
        """
        Register module class object to registry with the specified module
        name.

        :param module_name: module name
        :param module_cls: module class object
        :param force: Whether to override an existing class with
            the same name. Default: False.

        Example:
            >>> registry = Registry('formatter')
            >>> @registry.register_module()
            >>> class TextFormatter:
            >>>     pass

            >>> class TextFormatter2:
            >>>     pass
            >>> registry.register_module(module_name='text_formatter2',
            >>>                          module_cls=TextFormatter2)
        """
        if not (module_name is None or isinstance(module_name, str)):
            raise TypeError(f'module_name must be either None or str, '
                            f'got {type(module_name)}')
        if module_cls is not None:
            self._register_module(module_name=module_name,
                                  module_cls=module_cls,
                                  force=force)
            return module_cls

        # if module_cls is None, should return a decorator function
        def _register(module_cls):
            """
            Register module class object to registry.

            :param module_cls: module class object
            :return: module class object.
            """
            self._register_module(module_name=module_name,
                                  module_cls=module_cls,
                                  force=force)
            return module_cls

        return _register
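For readers skimming the source, the following sketch shows both registration styles (decorator and explicit call) against a hypothetical registry; the repo name and the registered classes are made up for illustration.

# A minimal sketch, assuming Registry is imported from data_juicer.utils.registry.
from data_juicer.utils.registry import Registry

OPERATORS = Registry('ops')          # hypothetical repo name

@OPERATORS.register_module('my_mapper')
class MyMapper:
    pass

class MyFilter:
    pass

# Explicit registration without the decorator.
OPERATORS.register_module(module_name='my_filter', module_cls=MyFilter)

assert OPERATORS.get('my_mapper') is MyMapper
OPERATORS.list()                     # logs one line per registered module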
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/index.html b/_modules/index.html deleted file mode 100644 index 8c8794279..000000000 --- a/_modules/index.html +++ /dev/null @@ -1,163 +0,0 @@ - - - - - - Overview: module code — data_juicer 0.1.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
-
    -
  • - -
  • -
  • -
-
-
-
-
- -

All modules for which code is available

- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_sources/data_juicer.analysis.rst.txt b/_sources/data_juicer.analysis.rst.txt deleted file mode 100644 index e8a6c97a7..000000000 --- a/_sources/data_juicer.analysis.rst.txt +++ /dev/null @@ -1,37 +0,0 @@ -data\_juicer.analysis package -============================= - -Submodules ----------- - -data\_juicer.analysis.column\_wise\_analysis module ---------------------------------------------------- - -.. automodule:: data_juicer.analysis.column_wise_analysis - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.diversity\_analysis module ------------------------------------------------- - -.. automodule:: data_juicer.analysis.diversity_analysis - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.overall\_analysis module ----------------------------------------------- - -.. automodule:: data_juicer.analysis.overall_analysis - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.analysis - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.config.rst.txt b/_sources/data_juicer.config.rst.txt deleted file mode 100644 index 9b7293596..000000000 --- a/_sources/data_juicer.config.rst.txt +++ /dev/null @@ -1,21 +0,0 @@ -data\_juicer.config package -=========================== - -Submodules ----------- - -data\_juicer.config.config module ---------------------------------- - -.. automodule:: data_juicer.config.config - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.config - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.core.rst.txt b/_sources/data_juicer.core.rst.txt deleted file mode 100644 index 858d271ca..000000000 --- a/_sources/data_juicer.core.rst.txt +++ /dev/null @@ -1,53 +0,0 @@ -data\_juicer.core package -========================= - -Submodules ----------- - -data\_juicer.core.analyser module ---------------------------------- - -.. automodule:: data_juicer.core.analyser - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.data module ------------------------------ - -.. automodule:: data_juicer.core.data - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.executor module ---------------------------------- - -.. automodule:: data_juicer.core.executor - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.exporter module ---------------------------------- - -.. automodule:: data_juicer.core.exporter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.tracer module -------------------------------- - -.. automodule:: data_juicer.core.tracer - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.core - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.format.rst.txt b/_sources/data_juicer.format.rst.txt deleted file mode 100644 index 575a5b16a..000000000 --- a/_sources/data_juicer.format.rst.txt +++ /dev/null @@ -1,77 +0,0 @@ -data\_juicer.format package -=========================== - -Submodules ----------- - -data\_juicer.format.csv\_formatter module ------------------------------------------ - -.. automodule:: data_juicer.format.csv_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.formatter module ------------------------------------- - -.. 
automodule:: data_juicer.format.formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.json\_formatter module ------------------------------------------- - -.. automodule:: data_juicer.format.json_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.load module -------------------------------- - -.. automodule:: data_juicer.format.load - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.mixture\_formatter module ---------------------------------------------- - -.. automodule:: data_juicer.format.mixture_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.parquet\_formatter module ---------------------------------------------- - -.. automodule:: data_juicer.format.parquet_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.text\_formatter module ------------------------------------------- - -.. automodule:: data_juicer.format.text_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.tsv\_formatter module ------------------------------------------ - -.. automodule:: data_juicer.format.tsv_formatter - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.format - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.common.rst.txt b/_sources/data_juicer.ops.common.rst.txt deleted file mode 100644 index be34ff5bf..000000000 --- a/_sources/data_juicer.ops.common.rst.txt +++ /dev/null @@ -1,29 +0,0 @@ -data\_juicer.ops.common package -=============================== - -Submodules ----------- - -data\_juicer.ops.common.helper\_func module -------------------------------------------- - -.. automodule:: data_juicer.ops.common.helper_func - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.common.special\_characters module --------------------------------------------------- - -.. automodule:: data_juicer.ops.common.special_characters - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.ops.common - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.deduplicator.rst.txt b/_sources/data_juicer.ops.deduplicator.rst.txt deleted file mode 100644 index d30ce1dad..000000000 --- a/_sources/data_juicer.ops.deduplicator.rst.txt +++ /dev/null @@ -1,37 +0,0 @@ -data\_juicer.ops.deduplicator package -===================================== - -Submodules ----------- - -data\_juicer.ops.deduplicator.document\_deduplicator module ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.deduplicator.document_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.document\_minhash\_deduplicator module --------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.deduplicator.document_minhash_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.document\_simhash\_deduplicator module --------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.deduplicator.document_simhash_deduplicator - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: data_juicer.ops.deduplicator - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.filter.rst.txt b/_sources/data_juicer.ops.filter.rst.txt deleted file mode 100644 index 64e449177..000000000 --- a/_sources/data_juicer.ops.filter.rst.txt +++ /dev/null @@ -1,133 +0,0 @@ -data\_juicer.ops.filter package -=============================== - -Submodules ----------- - -data\_juicer.ops.filter.alphanumeric\_filter module ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.alphanumeric_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.average\_line\_length\_filter module ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.average_line_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.character\_repetition\_filter module ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.character_repetition_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.flagged\_words\_filter module ------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.flagged_words_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.language\_id\_score\_filter module ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.language_id_score_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.maximum\_line\_length\_filter module ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.maximum_line_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.perplexity\_filter module -------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.perplexity_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.special\_characters\_filter module ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.special_characters_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.specified\_field\_filter module -------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.specified_field_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.specified\_numeric\_field\_filter module ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.specified_numeric_field_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.stopwords\_filter module ------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.stopwords_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.suffix\_filter module ---------------------------------------------- - -.. automodule:: data_juicer.ops.filter.suffix_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.text\_length\_filter module ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.text_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.word\_num\_filter module ------------------------------------------------- - -.. 
automodule:: data_juicer.ops.filter.word_num_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.word\_repetition\_filter module -------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.word_repetition_filter - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.ops.filter - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.mapper.rst.txt b/_sources/data_juicer.ops.mapper.rst.txt deleted file mode 100644 index c8688614b..000000000 --- a/_sources/data_juicer.ops.mapper.rst.txt +++ /dev/null @@ -1,149 +0,0 @@ -data\_juicer.ops.mapper package -=============================== - -Submodules ----------- - -data\_juicer.ops.mapper.clean\_copyright\_mapper module -------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_copyright_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_email\_mapper module ---------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_email_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_html\_mapper module --------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_html_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_ip\_mapper module ------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_ip_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_links\_mapper module ---------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_links_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.expand\_macro\_mapper module ----------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.expand_macro_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.fix\_unicode\_mapper module ---------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.fix_unicode_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.punctuation\_normalization\_mapper module ------------------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.punctuation_normalization_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_bibliography\_mapper module ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.remove_bibliography_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_comments\_mapper module -------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_comments_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_header\_mapper module ------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.remove_header_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_long\_words\_mapper module ----------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.remove_long_words_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_specific\_chars\_mapper module --------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_specific_chars_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_table\_text\_mapper module ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_table_text_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_words\_with\_incorrect\_substrings\_mapper module ---------------------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.sentence\_split\_mapper module ------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.sentence_split_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.whitespace\_normalization\_mapper module ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.whitespace_normalization_mapper - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.ops.mapper - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.rst.txt b/_sources/data_juicer.ops.rst.txt deleted file mode 100644 index f25068b50..000000000 --- a/_sources/data_juicer.ops.rst.txt +++ /dev/null @@ -1,41 +0,0 @@ -data\_juicer.ops package -======================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - data_juicer.ops.common - data_juicer.ops.deduplicator - data_juicer.ops.filter - data_juicer.ops.mapper - data_juicer.ops.selector - -Submodules ----------- - -data\_juicer.ops.base\_op module --------------------------------- - -.. automodule:: data_juicer.ops.base_op - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.load module ----------------------------- - -.. automodule:: data_juicer.ops.load - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.ops - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.ops.selector.rst.txt b/_sources/data_juicer.ops.selector.rst.txt deleted file mode 100644 index 266b47408..000000000 --- a/_sources/data_juicer.ops.selector.rst.txt +++ /dev/null @@ -1,29 +0,0 @@ -data\_juicer.ops.selector package -================================= - -Submodules ----------- - -data\_juicer.ops.selector.frequency\_specified\_field\_selector module ----------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.selector.frequency_specified_field_selector - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.selector.topk\_specified\_field\_selector module ------------------------------------------------------------------ - -.. automodule:: data_juicer.ops.selector.topk_specified_field_selector - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. 
automodule:: data_juicer.ops.selector - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.rst.txt b/_sources/data_juicer.rst.txt deleted file mode 100644 index c305d1dd0..000000000 --- a/_sources/data_juicer.rst.txt +++ /dev/null @@ -1,23 +0,0 @@ -data\_juicer package -==================== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - data_juicer.analysis - data_juicer.config - data_juicer.core - data_juicer.format - data_juicer.ops - data_juicer.utils - -Module contents ---------------- - -.. automodule:: data_juicer - :members: - :undoc-members: - :show-inheritance: diff --git a/_sources/data_juicer.utils.rst.txt b/_sources/data_juicer.utils.rst.txt deleted file mode 100644 index 65b8d1208..000000000 --- a/_sources/data_juicer.utils.rst.txt +++ /dev/null @@ -1,69 +0,0 @@ -data\_juicer.utils package -========================== - -Submodules ----------- - -data\_juicer.utils.asset\_utils module --------------------------------------- - -.. automodule:: data_juicer.utils.asset_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.cache\_utils module --------------------------------------- - -.. automodule:: data_juicer.utils.cache_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.ckpt\_utils module -------------------------------------- - -.. automodule:: data_juicer.utils.ckpt_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.file\_utils module -------------------------------------- - -.. automodule:: data_juicer.utils.file_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.logger\_utils module ---------------------------------------- - -.. automodule:: data_juicer.utils.logger_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.model\_utils module --------------------------------------- - -.. automodule:: data_juicer.utils.model_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.registry module ----------------------------------- - -.. automodule:: data_juicer.utils.registry - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: data_juicer.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/data_juicer.analysis.html b/data_juicer.analysis.html deleted file mode 100644 index 2ec601f20..000000000 --- a/data_juicer.analysis.html +++ /dev/null @@ -1,359 +0,0 @@ - - - - - - - data_juicer.analysis package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.analysis package

-
-

Submodules

-
-
-

data_juicer.analysis.column_wise_analysis module

-
-
-class data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]
-

Bases: object

-

Apply analysis on each column of stats respectively.

-
-
-analyse(show_percentiles=False, show=False)[source]
-

Apply analysis and draw the analysis figure for stats.

-
-
Parameters:
-
    -
  • show_percentiles – whether to show the percentile line in -each sub-figure. If it’s true, there will be several red -lines to indicate the quantiles of the stats distributions

  • -
  • show – whether to show in a single window after drawing

  • -
-
-
Returns:
-

-
-
-
- -
-
-draw_box(ax, data, save_path, percentiles=None, show=False)[source]
-

Draw the box plot for the data.

-
-
Parameters:
-
    -
  • ax – the axes to draw

  • -
  • data – data to draw

  • -
  • save_path – the path to save the box figure

  • -
  • percentiles – the overall analysis result of the data -including percentile information

  • -
  • show – whether to show in a single window after drawing

  • -
-
-
Returns:
-

-
-
-
- -
-
-draw_hist(ax, data, save_path, percentiles=None, show=False)[source]
-

Draw the histogram for the data.

-
-
Parameters:
-
    -
  • ax – the axes to draw

  • -
  • data – data to draw

  • -
  • save_path – the path to save the histogram figure

  • -
  • percentiles – the overall analysis result of the data -including percentile information

  • -
  • show – whether to show in a single window after drawing

  • -
-
-
Returns:
-

-
-
-
- -
- -
-
-data_juicer.analysis.column_wise_analysis.get_row_col(total_num, factor=2)[source]
-

Given the total number of stats figures, get the “best” number of rows and -columns. This function is needed when we need to store all stats figures -into one image.

-
-
Parameters:
-
    -
  • total_num – Total number of stats figures

  • -
  • factor – Number of sub-figure types in each figure. In -default, it’s 2, which means there are histogram and box plot -for each stat figure

  • -
-
-
Returns:
-

“best” number of rows and columns, and the grid list

-
-
-
- -
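As a usage sketch (not from the original docs): the analysis is typically constructed on a dataset whose per-sample stats have already been computed, e.g. by the Analyser described in data_juicer.core. The dataset variable and output path below are placeholders.

# A minimal sketch, assuming `stats_dataset` already carries the stats computed
# by the filter ops.
from data_juicer.analysis.column_wise_analysis import (ColumnWiseAnalysis,
                                                       get_row_col)

cwa = ColumnWiseAnalysis(dataset=stats_dataset,
                         output_path='./analysis_out',
                         save_stats_in_one_file=True)
cwa.analyse(show_percentiles=True, show=False)

# Helper used to arrange all stats figures into one image: with 7 figures and
# 2 sub-figure types it proposes a "best" rows/columns layout plus the grid list.
layout = get_row_col(total_num=7, factor=2)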
-
-

data_juicer.analysis.diversity_analysis module

-
-
-class data_juicer.analysis.diversity_analysis.DiversityAnalysis(dataset, output_path, lang_or_model='en')[source]
-

Bases: object

-

Apply diversity analysis for each sample and get an overall analysis -result.

-
-
-analyse(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]
-

Apply diversity analysis on the whole dataset.

-
-
Parameters:
-
    -
  • lang_or_model – the diversity model or a specific language -used to load the diversity model

  • -
  • column_name – the name of column to be analysed

  • -
  • postproc_func – function to analyse diversity. In default, -it’s function get_diversity

  • -
  • postproc_kwarg – arguments of the postproc_func

  • -
-
-
Returns:
-

-
-
-
- -
-
-compute(lang_or_model=None, column_name='text')[source]
-

Apply lexical tree analysis on each sample.

-
-
Parameters:
-
    -
  • lang_or_model – the diversity model or a specific language -used to load the diversity model

  • -
  • column_name – the name of column to be analysed

  • -
-
-
Returns:
-

the analysis result.

-
-
-
- -
- -
-
-data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj(tree_root)[source]
-

Find the verb and its object closest to the root.

-
-
Parameters:
-

tree_root – the root of lexical tree

-
-
Returns:
-

valid verb and its object.

-
-
-
- -
-
-data_juicer.analysis.diversity_analysis.find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True)[source]
-

Find the verb and its object closest to the root of lexical tree of input -string.

-
-
Parameters:
-
    -
  • nlp – the diversity model to analyse the diversity strings

  • -
  • s – the string to be analysed

  • -
  • first_sent – whether to analyse only the first sentence in the input string. If it's true, return the analysis result of the first sentence whether it is valid or not. If it's false, return the first valid result over all sentences

  • -
-
-
Returns:
-

valid verb and its object of this string

-
-
-
- -
-
-data_juicer.analysis.diversity_analysis.get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs)[source]
-

Given the lexical tree analysis result, return the diversity results.

-
-
Parameters:
-
    -
  • dataset – lexical tree analysis result

  • -
  • top_k_verbs – only keep the top_k_verbs largest verb groups

  • -
  • top_k_nouns – only keep the top_k_nouns largest noun groups -for each verb group

  • -
  • kwargs – extra args

  • -
-
-
Returns:
-

the diversity results

-
-
-
- -
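A hedged usage sketch for this module: `ds` stands for any HuggingFace-style dataset with a 'text' column, and 'en' makes the analysis load the English spaCy model prepared by the model utilities.

# A minimal sketch; the dataset and output path are placeholders.
from data_juicer.analysis.diversity_analysis import DiversityAnalysis

div = DiversityAnalysis(dataset=ds, output_path='./analysis_out',
                        lang_or_model='en')

# Per-sample lexical-tree analysis (root verb and its direct object) ...
raw = div.compute(column_name='text')
# ... and the aggregated diversity result via the default get_diversity
# post-processor; extra keyword arguments are forwarded to it.
diversity = div.analyse(column_name='text', top_k_verbs=20, top_k_nouns=4)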
-
-

data_juicer.analysis.overall_analysis module

-
-
-class data_juicer.analysis.overall_analysis.OverallAnalysis(dataset, output_path)[source]
-

Bases: object

-

Apply analysis on the overall stats, including mean, std, quantiles, -etc.

-
-
-analyse(percentiles=[])[source]
-

Apply overall analysis on the whole dataset based on the describe -method of pandas.

-
-
Parameters:
-

percentiles – percentiles to analyse

-
-
Returns:
-

the overall analysis result.

-
-
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.config.html b/data_juicer.config.html deleted file mode 100644 index e69d52a9f..000000000 --- a/data_juicer.config.html +++ /dev/null @@ -1,203 +0,0 @@ - - - - - - - data_juicer.config package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.config package

-
-

Submodules

-
-
-

data_juicer.config.config module

-
-
-data_juicer.config.config.config_backup(cfg)[source]
-
- -
-
-data_juicer.config.config.display_config(cfg)[source]
-
- -
-
-data_juicer.config.config.init_configs(args=None)[source]
-
-
initialize the jsonargparse parser and parse configs from one of:
    -
  1. POSIX-style command line args;
  2. config files in yaml (json and jsonnet supersets);
  3. environment variables;
  4. hard-coded defaults
-
-
-
-
Parameters:
-

args – list of params, e.g., ['--config', 'cfg.yaml'], default None.

-
-
Returns:
-

a global cfg object used by the Executor or Analyser

-
-
-
- -
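A minimal sketch of the intended call, assuming a YAML config file exists at the given (placeholder) path; the returned cfg is the global object consumed by the Executor and Analyser.

from data_juicer.config.config import init_configs

# CLI args, config files, environment variables and hard-coded defaults are
# merged into one cfg object.
cfg = init_configs(args=['--config', 'configs/demo.yaml'])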
-
-data_juicer.config.config.init_setup_from_cfg(cfg)[source]
-

Do some extra setup tasks after parsing config file or command line.

-
    -
  1. create working directory and a log directory
  2. update cache directory
  3. update checkpoint and temp_dir of tempfile
-
-
Parameters:
-
    -
  • cfg – an original cfg

  • -
  • cfg – an updated cfg

  • -
-
-
-
- -
-
-data_juicer.config.config.sort_op_by_types_and_names(op_name_classes)[source]
-

Split ops items by op type and sort them to sub-ops by name, then concat -together.

-
-
Parameters:
-

op_name_classes – a list of op modules

-
-
Returns:
-

sorted op list, each item is a pair of op_name and op_class

-
-
-
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.core.html b/data_juicer.core.html deleted file mode 100644 index 404a492a5..000000000 --- a/data_juicer.core.html +++ /dev/null @@ -1,505 +0,0 @@ - - - - - - - data_juicer.core package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.core package

-
-

Submodules

-
-
-

data_juicer.core.analyser module

-
-
-class data_juicer.core.analyser.Analyser(cfg=None)[source]
-

Bases: object

-

This Analyser class is used to analyse a specific dataset.

-

It will compute stats for all filter ops in the config file, apply multiple analyses (e.g. OverallAnalysis, ColumnWiseAnalysis, etc.) on these stats, and generate the analysis results (stats tables, distribution figures, etc.) to help users understand the input dataset better.

-
-
-run(load_data_np=None)[source]
-

Running the dataset analysis pipeline.

-
-
Parameters:
-

load_data_np – number of workers when loading the dataset.

-
-
Returns:
-

analysed dataset.

-
-
-
- -
- -
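A hedged end-to-end sketch: build the global cfg as shown for init_configs above, then run the analysis pipeline; the config path is a placeholder.

from data_juicer.config.config import init_configs
from data_juicer.core.analyser import Analyser

cfg = init_configs(args=['--config', 'configs/demo.yaml'])
analyser = Analyser(cfg)
analysed_dataset = analyser.run()   # generates stats tables and distribution figures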
-
-

data_juicer.core.data module

-
-
-class data_juicer.core.data.NestedDataset(*args, **kargs)[source]
-

Bases: Dataset

-

Enhanced HuggingFace-Dataset for better usability and efficiency.

-
-
-add_column(*args, **kargs)[source]
-

Override the add column func, such that the processed samples can be accessed in a nested manner.

-
- -
-
-cleanup_cache_files()[source]
-

Override the cleanup_cache_files func, clear raw and compressed -cache files.

-
- -
-
-filter(*args, **kargs)[source]
-

Override the filter func, which is called by most common operations, such that the processed samples can be accessed in a nested manner.

-
- -
-
-classmethod from_dict(*args, **kargs)[source]
-

Override the from_dict func, which is called by most from_xx -constructors, such that the constructed dataset object is -NestedDataset.

-
- -
-
-map(*args, **kargs)[source]
-

Override the map func, which is called by most common operations, such that the processed samples can be accessed in a nested manner.

-
- -
-
-remove_columns(*args, **kargs)[source]
-

Override the remove columns func, such that the processed samples can be accessed in a nested manner.

-
- -
-
-select(*args, **kargs)[source]
-

Override the select func, such that selected samples can be accessed in a nested manner.

-
- -
-
-select_columns(*args, **kargs)[source]
-

Override the select columns func, such that the processed samples can be accessed in a nested manner.

-
- -
- -
-
-class data_juicer.core.data.NestedDatasetDict(*args, **kargs)[source]
-

Bases: DatasetDict

-

Enhanced HuggingFace-DatasetDict for better usability and efficiency.

-
-
-map(**args)[source]
-

Override the map func, which is called by most common operations, such that the processed samples can be accessed in a nested manner.

-
- -
- -
-
-class data_juicer.core.data.NestedQueryDict(*args, **kargs)[source]
-

Bases: dict

-

Enhanced dict for better usability.

-
- -
-
-data_juicer.core.data.nested_obj_factory(obj)[source]
-

Use nested classes to wrap the input object.

-
-
Parameters:
-

obj – object to be nested.

-
-
Returns:
-

nested object

-
-
-
- -
-
-data_juicer.core.data.nested_query(root_obj: NestedDatasetDict | NestedDataset | NestedQueryDict, key)[source]
-

Find item from a given object, by first checking flatten layer, then -checking nested layers.

-
-
Parameters:
-
    -
  • root_obj – the object

  • -
  • key – the stored item to be queried, e.g., “meta” or -“meta.date”

  • -
-
-
Returns:
-

-
-
-
- -
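To make the nested access concrete, here is a small sketch; the field names are hypothetical, and the exact wrapping behaviour may differ slightly from this simplification.

from data_juicer.core.data import NestedQueryDict, nested_query

sample = NestedQueryDict({'text': 'hello', 'meta': {'date': '2023-08-01'}})
# First the flat layer is checked for the literal key, then nested layers.
date = nested_query(sample, 'meta.date')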
-
-data_juicer.core.data.wrap_func_with_nested_access(f)[source]
-

Before conducting actual function f, wrap its args and kargs into nested -ones.

-
-
Parameters:
-

f – function to be wrapped.

-
-
Returns:
-

wrapped function

-
-
-
- -
-
-

data_juicer.core.executor module

-
-
-class data_juicer.core.executor.Executor(cfg=None)[source]
-

Bases: object

-

This Executor class is used to process a specific dataset.

-

It will load the dataset and unify the format, then apply all the -ops in the config file in order and generate a processed dataset.

-
-
-run(load_data_np=None)[source]
-

Running the dataset process pipeline.

-
-
Parameters:
-

load_data_np – number of workers when loading the dataset.

-
-
Returns:
-

processed dataset.

-
-
-
- -
- -
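A hedged sketch mirroring the Analyser usage above: the Executor applies all configured ops in order and returns the processed dataset; the config path is again a placeholder.

from data_juicer.config.config import init_configs
from data_juicer.core.executor import Executor

cfg = init_configs(args=['--config', 'configs/demo.yaml'])
processed_dataset = Executor(cfg).run()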
-
-

data_juicer.core.exporter module

-
-
-class data_juicer.core.exporter.Exporter(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, export_stats=True)[source]
-

Bases: object

-

The Exporter class is used to export a dataset to files of specific -format.

-
-
-GiB = 1073741824
-
- -
-
-KiB = 1024
-
- -
-
-MiB = 1048576
-
- -
-
-TiB = 1099511627776
-
- -
-
-export(dataset)[source]
-

Export method for a dataset.

-
-
Parameters:
-

dataset – the dataset to export.

-
-
Returns:
-

-
-
-
- -
-
-static to_jsonl(dataset, export_path, num_proc=1, **kwargs)[source]
-

Export method for json/jsonl target files.

-
-
Parameters:
-
    -
  • dataset – the dataset to export.

  • -
  • export_path – the path to store the exported dataset.

  • -
  • num_proc – the number of processes used to export the dataset.

  • -
  • kwargs – extra arguments.

  • -
-
-
Returns:
-

-
-
-
- -
-
-static to_parquet(dataset, export_path, **kwargs)[source]
-

Export method for parquet target files.

-
-
Parameters:
-
    -
  • dataset – the dataset to export.

  • -
  • export_path – the path to store the exported dataset.

  • -
  • kwargs – extra arguments.

  • -
-
-
Returns:
-

-
-
-
- -
- -
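A hedged sketch of exporting a processed dataset; it assumes the export path suffix selects the target format and that export_shard_size is given in bytes (hence the KiB/MiB/GiB/TiB constants above).

from data_juicer.core.exporter import Exporter

exporter = Exporter(export_path='./outputs/result.jsonl',   # placeholder path
                    export_shard_size=Exporter.GiB,         # assumption: bytes per shard
                    num_proc=4)
exporter.export(processed_dataset)

# The static helpers can also be called directly:
Exporter.to_parquet(processed_dataset, export_path='./outputs/result.parquet')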
-
-

data_juicer.core.tracer module

-
-
-class data_juicer.core.tracer.Tracer(work_dir, show_num=10)[source]
-

Bases: object

-

The tracer to trace the sample changes before and after an operator -process.

-

The comparison results will be stored in the work directory.

-
-
-trace_batch_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]
-

Compare datasets before and after a BatchMapper.

-

This will mainly show the new samples augmented by the BatchMapper

-
-
Parameters:
-
    -
  • op_name – the op name of mapper

  • -
  • previous_ds – dataset before the mapper process

  • -
  • processed_ds – dataset processed by the mapper

  • -
  • text_key – which text_key to trace

  • -
-
-
Returns:
-

-
-
-
- -
-
-trace_deduplicator(op_name: str, dup_pairs: list)[source]
-

Compare datasets before and after a Deduplicator.

-

This will mainly show the near-duplicate sample pairs extracted -by the Deduplicator. Different from the other two trace methods, -the trace process for deduplicator is embedded into the process -method of deduplicator, but the other two trace methods are -independent of the process method of mapper and filter operators

-
-
Parameters:
-
    -
  • op_name – the op name of deduplicator

  • -
  • dup_pairs – duplicate sample pairs obtained from -deduplicator

  • -
-
-
Returns:
-

-
-
-
- -
-
-trace_filter(op_name: str, previous_ds: Dataset, processed_ds: Dataset)[source]
-

Compare datasets before and after a Filter.

-

This will mainly show the filtered samples by the Filter

-
-
Parameters:
-
    -
  • op_name – the op name of filter

  • -
  • previous_ds – dataset before the filter process

  • -
  • processed_ds – dataset processed by the filter

  • -
-
-
Returns:
-

-
-
-
- -
-
-trace_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]
-

Compare datasets before and after a Mapper.

-

This will mainly show the different sample pairs due to the -modification by the Mapper

-
-
Parameters:
-
    -
  • op_name – the op name of mapper

  • -
  • previous_ds – dataset before the mapper process

  • -
  • processed_ds – dataset processed by the mapper

  • -
  • text_key – which text_key to trace

  • -
-
-
Returns:
-

-
-
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.format.html b/data_juicer.format.html deleted file mode 100644 index ef6f31201..000000000 --- a/data_juicer.format.html +++ /dev/null @@ -1,437 +0,0 @@ - - - - - - - data_juicer.format package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.format package

-
-

Submodules

-
-
-

data_juicer.format.csv_formatter module

-
-
-class data_juicer.format.csv_formatter.CsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
-

Bases: LocalFormatter

-

The class is used to load and format csv-type files.

-

Default suffixes is [‘.csv’]

-
-
-SUFFIXES = ['.csv']
-
- -
- -
-
-

data_juicer.format.formatter module

-
-
-class data_juicer.format.formatter.BaseFormatter[source]
-

Bases: object

-

Base class to load dataset.

-
-
-load_dataset(*args) Dataset[source]
-
- -
- -
-
-class data_juicer.format.formatter.LocalFormatter(dataset_path: str, type: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
-

Bases: BaseFormatter

-

The class is used to load a dataset from local files or local -directory.

-
-
-load_dataset(num_proc: int = 1) Dataset[source]
-

Load a dataset from dataset file or dataset directory, and unify its -format.

-
-
Parameters:
-
    -
  • num_proc – number of processes when loading the dataset

  • -
  • global_cfg – global cfg used in consequent processes,

  • -
-
-
Returns:
-

formatted dataset

-
-
-
- -
- -
-
-class data_juicer.format.formatter.RemoteFormatter(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
-

Bases: BaseFormatter

-

The class is used to load a dataset from repository of huggingface -hub.

-
-
-load_dataset(num_proc: int = 1) Dataset[source]
-

Load a dataset from HuggingFace, and unify its format.

-
-
Parameters:
-
    -
  • num_proc – number of processes when loading the dataset

  • -
  • global_cfg – the global cfg used in consequent processes,

  • -
-
-
Returns:
-

formatted dataset

-
-
-
- -
- -
-
-data_juicer.format.formatter.add_suffixes(datasets: DatasetDict) Dataset[source]
-

Add suffix field to datasets.

-
-
Parameters:
-

datasets – a DatasetDict object

-
-
Returns:
-

datasets with suffix features.

-
-
-
- -
-
-data_juicer.format.formatter.load_formatter(dataset_path, text_keys=None, suffixes=None, add_suffix=False, **kwargs) BaseFormatter[source]
-

Load the appropriate formatter for different types of data formats.

-
-
Parameters:
-
    -
  • dataset_path – Path to dataset file or dataset directory

  • -
  • text_keys – key names of field that stores sample text. -Default: None

  • -
  • suffixes – the suffix of files that will be read. Default: -None

  • -
-
-
Returns:
-

a dataset formatter.

-
-
-
- -
-
-data_juicer.format.formatter.unify_format(dataset: Dataset, text_keys: List[str] | str = 'text', num_proc: int = 1) Dataset[source]
-

Get a unified internal format, conducting the following modifications.

-
    -
  1. check keys of dataset
  2. filter out those samples with empty or None text
-
-
Parameters:
-
    -
  • dataset – input dataset

  • -
  • text_keys – original text key(s) of dataset.

  • -
  • num_proc – number of processes for mapping

  • -
  • global_cfg – the global cfg used in consequent processes, -since cfg.text_key may be modified after unifying

  • -
-
-
Returns:
-

unified_format_dataset

-
-
-
- -
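A hedged sketch tying these pieces together: load_formatter picks a formatter by file suffix, and load_dataset returns a dataset already passed through unify_format. The data directory is a placeholder.

from data_juicer.format.formatter import load_formatter

formatter = load_formatter(dataset_path='./data/demo',
                           text_keys=['text'],
                           suffixes=['.jsonl'])
# Unified 'text' field; samples with empty or None text are filtered out.
dataset = formatter.load_dataset(num_proc=4)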
-
-

data_juicer.format.json_formatter module

-
-
-class data_juicer.format.json_formatter.JsonFormatter(dataset_path, suffixes=None, **kwargs)[source]
-

Bases: LocalFormatter

-

The class is used to load and format json-type files.

-

Default suffixes is [‘.json’, ‘.jsonl’, ‘.jsonl.zst’]

-
-
-SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
-
- -
- -
-
-

data_juicer.format.load module

-
-
-data_juicer.format.load.load_formatter(dataset_path, text_keys=None, suffixes=[], add_suffix=False, **kwargs) BaseFormatter[source]
-

Load a mixture formatter for multiple different data formats with an optional weight (default 1.0) according to their formats.

-
-
Parameters:
-
    -
  • dataset_path – path to a dataset file or a dataset directory

  • -
  • text_keys – key names of field that stores sample text. -Default: None

  • -
  • suffixes – files with specified suffixes to be processed.

  • -
  • add_suffix – whether to add the file suffix to dataset meta -info

  • -
-
-
Returns:
-

a dataset formatter.

-
-
-
- -
-
-

data_juicer.format.mixture_formatter module

-
-
-class data_juicer.format.mixture_formatter.MixtureFormatter(dataset_path: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys=None, add_suffix=False, **kwargs)[source]
-

Bases: BaseFormatter

-

The class mixes multiple datasets by randomly selecting samples from every dataset and merging them, and then exports the merged dataset as a new mixed dataset.

-
-
-load_dataset(num_proc: int = 1) Dataset[source]
-

Load a mixed dataset.

-
-
Parameters:
-

num_proc – number of processes when loading the dataset

-
-
Returns:
-

mixed dataset

-
-
-
- -
- -
-
-

data_juicer.format.parquet_formatter module

-
-
-class data_juicer.format.parquet_formatter.ParquetFormatter(dataset_path, suffixes=None, **kwargs)[source]
-

Bases: LocalFormatter

-

The class is used to load and format parquet-type files.

-

Default suffixes is [‘.parquet’]

-
-
-SUFFIXES = ['.parquet']
-
- -
- -
-
-

data_juicer.format.text_formatter module

-
-
-class data_juicer.format.text_formatter.TextFormatter(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
-

Bases: LocalFormatter

-

The class is used to load and format text-type files.

-

e.g. [‘.txt’, ‘.pdf’, ‘.cpp’, ‘.docx’]

-
-
-SUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']
-
- -
-
-load_dataset(num_proc: int = 1) Dataset[source]
-

Load a dataset from local text-type files.

-
-
Parameters:
-

num_proc – number of processes when loading the dataset

-
-
Returns:
-

unified_format_dataset.

-
-
-
- -
- -
-
-data_juicer.format.text_formatter.extract_txt_from_docx(fn, tgt_path)[source]
-

Extract text from a docx file and save to target path.

-
-
Parameters:
-
    -
  • fn – path to input docx file

  • -
  • tgt_path – path to save text file.

  • -
-
-
-
- -
-
-data_juicer.format.text_formatter.extract_txt_from_pdf(fn, tgt_path)[source]
-

Extract text from a pdf file and save to target path.

-
-
Parameters:
-
    -
  • fn – path to input pdf file

  • -
  • tgt_path – path to save text file.

  • -
-
-
-
- -
-
-

data_juicer.format.tsv_formatter module

-
-
-class data_juicer.format.tsv_formatter.TsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
-

Bases: LocalFormatter

-

The class is used to load and format tsv-type files.

-

Default suffixes is [‘.tsv’]

-
-
-SUFFIXES = ['.tsv']
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.html b/data_juicer.html deleted file mode 100644 index 2e15fa434..000000000 --- a/data_juicer.html +++ /dev/null @@ -1,474 +0,0 @@ - - - - - - - data_juicer package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer package

-
-

Subpackages

-
- -
-
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.ops.common.html b/data_juicer.ops.common.html deleted file mode 100644 index 478b144ec..000000000 --- a/data_juicer.ops.common.html +++ /dev/null @@ -1,305 +0,0 @@ - - - - - - - data_juicer.ops.common package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops.common package

-
-

Submodules

-
-
-

data_juicer.ops.common.helper_func module

-
-
-class data_juicer.ops.common.helper_func.UnionFind[source]
-

Bases: object

-
-
-find(x)[source]
-
- -
-
-union(x, y)[source]
-
- -
- -
-
-data_juicer.ops.common.helper_func.get_sentences_from_document(document, model_func=None)[source]
-

Get sentences from a document.

-
-
Parameters:
-
    -
  • document – document that needs to be split into sentences

  • -
  • model_func – function of sentence model; if specified, the function will be used for splitting the document into different sentences.

  • -
-
-
Returns:
-

document with the sentences separated by ‘\n’

-
-
-
- -
-
-data_juicer.ops.common.helper_func.get_words_from_document(document, token_func=None, new_line=True, tab=True)[source]
-

Get words from a document. Useful to compute ratios, like the -stopwords ratio.

-
-
Parameters:
-
    -
  • document – document that needs to be split into words

  • -
  • token_func – function of tokenizer; if specified, the function will be used for splitting the document into different tokens.

  • -
  • new_line – whether to use `\n’ to split words

  • -
  • tab – whether to use ‘\t’ to split words

  • -
-
-
Returns:
-

word list obtained from document

-
-
-
- -
-
-data_juicer.ops.common.helper_func.merge_on_whitespace_tab_newline(sentences)[source]
-

This method is used to merge different levels of sub-sentences into one document. It is the inverse of split_on_newline_tab_whitespace and removes concatenated separators.

-
-
Parameters:
-

sentences – sentence list to be merged

-
-
Returns:
-

document obtained after merging sub-sentences

-
-
-
- -
-
-data_juicer.ops.common.helper_func.split_on_newline_tab_whitespace(document)[source]
-

This method is used to split the document into different levels of sub-sentences.

-

First split on “\n”, then on “\t”, then on “ ”.
:param document: document to be split
:return: sentence list obtained after splitting the document

-
- -
-
-data_juicer.ops.common.helper_func.split_on_whitespace(document, new_line=False, tab=False)[source]
-

This method also removes concatenated spaces.

-
-
Parameters:
-
    -
  • document – document to be split

  • -
  • new_line – whether to split document with ‘\n’

  • -
  • tab – whether to split the document with ‘\t’

  • -
-
-
Returns:
-

word list obtained after splitting document

-
-
-
- -
-
-data_juicer.ops.common.helper_func.strip(document, strip_characters)[source]
-

Way faster than document.strip(strip_characters) since strip_characters is -now a set instead of a str, and it contains a lot of elements (all the -emojis).

-
-
Parameters:
-
    -
  • document – document to be processed

  • -
  • strip_characters – characters used for stripping the document

  • -
-
-
Returns:
-

stripped document

-
-
-
- -
-
-data_juicer.ops.common.helper_func.words_augmentation(words, group_size, join_char)[source]
-

Augment words, especially for Chinese (without a space between words) and -Vietnamese (with a space between syllables).

-
-
Parameters:
-
    -
  • words – word list to be augmented

  • -
  • group_size – the size of word groups that need to be merged

  • -
  • join_char – characters to be added between word group

  • -
-
-
Returns:
-

word list after augment

-
-
-
- -
-
-data_juicer.ops.common.helper_func.words_refinement(words, lower_case=False, strip_chars=None, use_words_aug=False, words_aug_group_sizes=[2], words_aug_join_char='')[source]
-

Refine split words. Non reversible since the document is split on -multiple characters, words are stripped of special characters and -characters are converted to lower case.

-
-
Parameters:
-
    -
  • words – the word list to be augmented

  • -
  • lower_case – whether to convert word to lowercase

  • -
  • strip_chars – chars that need to be stripped in words

  • -
  • use_words_aug – whether to use word augmentation

  • -
  • words_aug_group_sizes – the size of word groups that need to -be merged

  • -
  • words_aug_join_char – characters to be added between word -group

  • -
-
-
Returns:
-

refined words or word list

-
-
-
- -
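A small hedged sketch of the word-level helpers above; the expected outputs in the comments are approximate rather than authoritative.

from data_juicer.ops.common.helper_func import (get_words_from_document,
                                                words_augmentation,
                                                words_refinement)

doc = 'Hello   World\tfrom  data-juicer\n'
words = get_words_from_document(doc)        # roughly ['Hello', 'World', 'from', 'data-juicer']
refined = words_refinement(words, lower_case=True)

# For languages without spaces between words, adjacent characters can be merged
# into n-grams (group_size=2 here, joined without a separator).
aug = words_augmentation(['你', '好', '世', '界'], group_size=2, join_char='')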
-
-

data_juicer.ops.common.special_characters module

-
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html deleted file mode 100644 index eda4d502a..000000000 --- a/data_juicer.ops.deduplicator.html +++ /dev/null @@ -1,336 +0,0 @@ - - - - - - - data_juicer.ops.deduplicator package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops.deduplicator package

-
-

Submodules

-
-
-

data_juicer.ops.deduplicator.document_deduplicator module

-
-
-class data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
-

Bases: Deduplicator

-

Deduplicator to deduplicate samples at document-level using exact matching.

-

Using md5 hash to deduplicate samples.

-
-
-compute_hash(sample)[source]
-

Compute md5 hash values for the sample.

-
-
Parameters:
-

sample – input sample

-
-
Returns:
-

sample with md5 hash value.

-
-
-
- -
-
-process(dataset, show_num=0)[source]
-

For doc-level, dataset –> dataset.

-
-
Parameters:
-
    -
  • dataset – input dataset

  • -
  • show_num – number of traced samples used when tracer is -open.

  • -
-
-
Returns:
-

deduplicated dataset and the sampled duplicate pairs.

-
-
-
- -
- -
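A hedged sketch of how this op is typically driven (the exact pipeline wiring lives in the Executor): compute_hash is applied per sample, e.g. through dataset.map, and process then returns the deduplicated dataset plus sampled duplicate pairs.

from data_juicer.ops.deduplicator.document_deduplicator import \
    DocumentDeduplicator

op = DocumentDeduplicator(lowercase=True, ignore_non_character=False)
dataset = dataset.map(op.compute_hash)                # assumption: md5 hash added per sample via map
dataset, dup_pairs = op.process(dataset, show_num=0)  # dup_pairs only populated when tracing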
-
-

data_juicer.ops.deduplicator.document_minhash_deduplicator module

-
-
-class data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator(tokenization: str = 'space', window_size: PositiveInt = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: PositiveInt = 256, jaccard_threshold: ClosedUnitInterval = 0.7, num_bands: PositiveInt | None = None, num_rows_per_band: PositiveInt | None = None, *args, **kwargs)[source]
-

Bases: Deduplicator

-

Deduplicator to deduplicate samples at document-level using MinHashLSH.

-

Different from simhash, minhash values are stored as bytes, so they won’t be kept in the final dataset.

-
-
-compute_hash(sample)[source]
-

Compute minhash values for the sample.

-
-
Parameters:
-

sample – input sample

-
-
Returns:
-

sample with minhash value.

-
-
-
- -
-
-process(dataset, show_num=0)[source]
-

For doc-level, dataset –> dataset.

-
-
Parameters:
-
    -
  • dataset – input dataset

  • -
  • show_num – number of traced samples used when tracer is -open.

  • -
-
-
Returns:
-

deduplicated dataset and the sampled duplicate pairs.

-
-
-
- -
- -
-
-data_juicer.ops.deduplicator.document_minhash_deduplicator.optimal_param(threshold: float, num_perm: int, false_positive_weight: float = 0.5, false_negative_weight: float = 0.5)[source]
-

Compute the optimal MinHashLSH parameter that minimizes the weighted sum -of probabilities of false positive and false negative, taken from -datasketch.

-
-
Parameters:
-
    -
  • threshold – float. The threshold for similarity

  • -
  • num_perm – int. The number of permutations

  • -
  • false_positive_weight – float. The weight of false positive

  • -
  • false_negative_weight – float. The weight of false negative

  • -
-
-
Returns:
-

Tuple[int, int]. The optimal b and r parameters. The number of -bands, and the number of rows per band respectively

-
-
-
- -
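For example, the band/row configuration for the deduplicator's default settings can be derived directly:

from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
    optimal_param

# Optimal (num_bands, num_rows_per_band) for a 0.7 Jaccard threshold and 256
# permutations, weighting false positives and negatives equally.
num_bands, num_rows_per_band = optimal_param(threshold=0.7, num_perm=256)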
-
-data_juicer.ops.deduplicator.document_minhash_deduplicator.sha1_hash32(data)[source]
-

Directly taken from datasketch package to avoid dependency.

-
-
Parameters:
-

data (bytes) –

-
-
Return type:
-

int

-
-
-
- -
-
-

data_juicer.ops.deduplicator.document_simhash_deduplicator module

-
-
-class data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: PositiveInt = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: PositiveInt = 6, hamming_distance: PositiveInt = 4, *args, **kwargs)[source]
-

Bases: Deduplicator

-

Deduplicator to deduplicate samples at document-level using SimHash.

-
-
-compute_hash(sample)[source]
-

Compute simhash values for the sample.

-
-
Parameters:
-

sample – input sample

-
-
Returns:
-

sample with simhash value.

-
-
-
- -
-
-process(dataset, show_num=0)[source]
-

For doc-level, dataset –> dataset.

-
-
Parameters:
-
    -
  • dataset – input dataset

  • -
  • show_num – number of traced samples used when tracer is -open.

  • -
-
-
Returns:
-

deduplicated dataset and the sampled duplicate pairs.

-
-
-
- -
- -
-
-data_juicer.ops.deduplicator.document_simhash_deduplicator.local_num_differing_bits(hash_a, hash_b)[source]
-

Local implementation of calculating the number of different bits between -two integers.

-
-
Parameters:
-
    -
  • hash_a – integer hash value a

  • -
  • hash_b – integer hash value b

  • -
-
-
Returns:
-

number of different bits between input hashes.

-
-
-
- -
-
-data_juicer.ops.deduplicator.document_simhash_deduplicator.num_differing_bits_selector()[source]
-

Select a num_differing_bits method according to the Python version -installed.

-

When Python >= 3.9, the original simhash library cannot be compiled correctly due to some changes in Cython. After fixing this incompatibility, RecursionError sometimes occurs when calling simhash.num_differing_bits. So we use our own implementation when Python >= 3.9. Otherwise, we use the implementation from simhash.

-
-
Returns:
-

an available num_differing_bits function.

-
-
-
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html deleted file mode 100644 index 29f3aad15..000000000 --- a/data_juicer.ops.filter.html +++ /dev/null @@ -1,802 +0,0 @@ - - - - - - - data_juicer.ops.filter package — data_juicer 0.1.2 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops.filter package

-
-

Submodules

-
-
-

data_juicer.ops.filter.alphanumeric_filter module

-
-
-class data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: PositiveFloat = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with alphabet/numeric ratio within a specific -range.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-
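A hand-driven sketch of the compute_stats/process protocol documented above. The sample layout, in particular the 'text' key and the empty 'stats' container, is an assumption for illustration; in a real pipeline the executor prepares samples before ops see them.

from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter

op = AlphanumericFilter(min_ratio=0.25)

# Assumed sample schema: a 'text' field plus a stats container for the op to fill.
sample = {'text': 'Mostly letters and digits 12345.', 'stats': {}}
sample = op.compute_stats(sample)  # records the alphanumeric ratio
print(op.process(sample))          # True -> keep the sample, False -> filter it out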

data_juicer.ops.filter.average_line_length_filter module

-
-
-class data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with average line length within a specific -range.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.character_repetition_filter module

-
-
-class data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter(rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with char-level n-gram repetition ratio within a -specific range.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.flagged_words_filter module

-
-
-class data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: ClosedUnitInterval = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with flagged-word ratio less than a specific max -value.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.language_id_score_filter module

-
-
-class data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter(lang: str = '', min_score: ClosedUnitInterval = 0.8, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples in a specific language with confidence score -larger than a specific min value.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-
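Construction follows the signature above; the same compute_stats/process protocol sketched earlier for AlphanumericFilter applies. This filter is assumed to rely on a fastText language-identification model fetched via the model utilities documented further down.

from data_juicer.ops.filter.language_id_score_filter import LanguageIDScoreFilter

# Keep only samples detected as English with a confidence of at least 0.8.
lang_filter = LanguageIDScoreFilter(lang='en', min_score=0.8)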

data_juicer.ops.filter.maximum_line_length_filter module

-
-
-class data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with maximum line length within a specific -range.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.perplexity_filter module

-
-
-class data_juicer.ops.filter.perplexity_filter.PerplexityFilter(lang: str = 'en', max_ppl: PositiveFloat = 1500, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with perplexity score less than a specific max -value.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.special_characters_filter module

-
-
-class data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter(min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.25, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with special-char ratio within a specific -range.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.specified_field_filter module

-
-
-class data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter(field_key: str = '', target_value: List | Tuple = [], *args, **kwargs)[source]
-

Bases: Filter

-

Filter based on specified field information.

-

If the specified field information in the sample is not within the -specified target value, the sample will be filtered.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.specified_numeric_field_filter module

-
-
-class data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter based on specified numeric field information.

-

If the specified numeric information in the sample is not within the -specified range, the sample will be filtered.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-data_juicer.ops.filter.specified_numeric_field_filter.is_number(s)[source]
-
- -
-
-

data_juicer.ops.filter.stopwords_filter module

-
-
-class data_juicer.ops.filter.stopwords_filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: ClosedUnitInterval = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with stopword ratio larger than a specific min -value.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.suffix_filter module

-
-
-class data_juicer.ops.filter.suffix_filter.SuffixFilter(suffixes: str | List[str] | Tuple[str] = [], *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with specified suffix.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.text_length_filter module

-
-
-class data_juicer.ops.filter.text_length_filter.TextLengthFilter(min_len: PositiveInt = 10, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with total text length within a specific -range.

-
-
-compute_stats(sample)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.word_num_filter module

-
-
-class data_juicer.ops.filter.word_num_filter.WordNumFilter(lang: str = 'en', tokenization: bool = False, min_num: PositiveInt = 10, max_num: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with total words number within a specific -range.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

data_juicer.ops.filter.word_repetition_filter module

-
-
-class data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter(lang: str = 'en', tokenization: bool = False, rep_len: PositiveInt = 10, min_ratio: ClosedUnitInterval = 0.0, max_ratio: ClosedUnitInterval = 0.5, *args, **kwargs)[source]
-

Bases: Filter

-

Filter to keep samples with word-level n-gram repetition ratio within a -specific range.

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
\ No newline at end of file
diff --git a/data_juicer.ops.html b/data_juicer.ops.html deleted file mode 100644 index 232a8bc0a..000000000 --- a/data_juicer.ops.html +++ /dev/null @@ -1,616 +0,0 @@
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops package

-
-

Subpackages

-
- -
-
-
-

Submodules

-
-
-

data_juicer.ops.base_op module

-
-
-class data_juicer.ops.base_op.Deduplicator(text_key: str | None = None)[source]
-

Bases: object

-
-
-compute_hash(sample)[source]
-

Compute hash values for the sample.

-
-
Parameters:
-

sample – input sample

-
-
Returns:
-

sample with computed hash value.

-
-
-
- -
-
-process(dataset, show_num=0)[source]
-

For doc-level, dataset –> dataset.

-
-
Parameters:
-
    -
  • dataset – input dataset

  • -
  • show_num – number of traced samples used when the tracer is enabled.

  • -
-
-
Returns:
-

deduplicated dataset and the sampled duplicate pairs.

-
-
-
- -
- -
-
-class data_juicer.ops.base_op.Filter(text_key: str | None = None)[source]
-

Bases: object

-
-
-compute_stats(sample, context=False)[source]
-

Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

-
-
Parameters:
-
    -
  • sample – input sample.

  • -
  • context – whether to store context information of intermediate -vars in the sample temporarily.

  • -
-
-
Returns:
-

sample with computed stats

-
-
-
- -
-
-process(sample)[source]
-

For sample level, sample –> Boolean.

-
-
Parameters:
-

sample – sample to decide whether to filter

-
-
Returns:
-

true for keeping and false for filtering

-
-
-
- -
- -
-
-class data_juicer.ops.base_op.Mapper(text_key: str | None = None)[source]
-

Bases: object

-
-
-is_batched_op()[source]
-
- -
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-class data_juicer.ops.base_op.Selector(text_key: str | None = None)[source]
-

Bases: object

-
-
-process(dataset)[source]
-

Dataset –> dataset.

-
-
Parameters:
-

dataset – input dataset

-
-
Returns:
-

selected dataset.

-
-
-
- -
- -
-
-
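To make the Filter contract above concrete, here is a minimal hypothetical subclass (it does not ship with data_juicer): it keeps samples whose text contains at least one digit. The 'text' key and the plain-dict stats container are illustrative assumptions.

from data_juicer.ops.base_op import Filter

class ContainsDigitFilter(Filter):
    """Hypothetical filter: keep samples whose text contains a digit."""

    def compute_stats(self, sample, context=False):
        sample.setdefault('stats', {})
        sample['stats']['has_digit'] = any(ch.isdigit() for ch in sample['text'])
        return sample

    def process(self, sample):
        return sample['stats']['has_digit']  # True -> keep, False -> filter out

op = ContainsDigitFilter()
sample = op.compute_stats({'text': 'Released in 2023.'})
print(op.process(sample))  # True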

data_juicer.ops.load module

-
-
-data_juicer.ops.load.load_ops(process_list, op_fusion=False)[source]
-

Load op list according to the process list from config file.

-
-
Parameters:
-
    -
  • process_list – A process list. Each item is an op name and its -arguments.

  • -
  • op_fusion – whether to fuse ops that share the same intermediate -variables.

  • -
-
-
Returns:
-

The op instance list.

-
-
-
- -
-
-
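A sketch of turning a config-style process list into op instances. The one-key-dict-per-op item format mirrors the YAML process list and is an assumption based on the description above.

from data_juicer.ops.load import load_ops

process_list = [
    {'clean_email_mapper': {}},
    {'alphanumeric_filter': {'min_ratio': 0.25}},
]
ops = load_ops(process_list, op_fusion=False)
print([type(op).__name__ for op in ops])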

Module contents

-
-
- - -
-
- -
-
-
-
\ No newline at end of file
diff --git a/data_juicer.ops.mapper.html b/data_juicer.ops.mapper.html deleted file mode 100644 index 44aedac3f..000000000 --- a/data_juicer.ops.mapper.html +++ /dev/null @@ -1,565 +0,0 @@
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops.mapper package

-
-

Submodules

-
- -
-

data_juicer.ops.mapper.clean_email_mapper module

-
-
-class data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to clean email in text samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-
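A minimal sketch of the mapper interface, assuming the default 'text' field:

from data_juicer.ops.mapper.clean_email_mapper import CleanEmailMapper

op = CleanEmailMapper()
sample = {'text': 'Contact us at support@example.com for help.'}
print(op.process(sample)['text'])  # the email address is cleaned from the text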

data_juicer.ops.mapper.clean_html_mapper module

-
-
-class data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to clean html code in text samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.clean_ip_mapper module

-
-
-class data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to clean ipv4 and ipv6 address in text samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
- -
-

data_juicer.ops.mapper.expand_macro_mapper module

-
-
-class data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to expand macro definitions in the document body of LaTeX samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.fix_unicode_mapper module

-
-
-class data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to fix unicode errors in text samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.punctuation_normalization_mapper module

-
-
-class data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to normalize unicode punctuations to English punctuations in text -samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_bibliography_mapper module

-
-
-class data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove the bibliography at the end of documents in LaTeX samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_comments_mapper module

-
-
-class data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove comments in different kinds of documents.

-

Only ‘tex’ is supported for now.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_header_mapper module

-
-
-class data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove headers at the beginning of documents in LaTeX samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_long_words_mapper module

-
-
-class data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper(min_len: PositiveInt = 1, max_len: PositiveInt = 9223372036854775807, *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove long words within a specific range.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
-
-should_keep_long_word(word)[source]
-
- -
- -
-
-

data_juicer.ops.mapper.remove_specific_chars_mapper module

-
-
-class data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to clean specific chars in text samples.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_table_text_mapper module

-
-
-class data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper(min_col: from_2_to_20 = 2, max_col: from_2_to_20 = 20, *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove table texts from text samples.

-

A regular expression is used to remove tables whose number of columns falls within the specified range.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper module

-
-
-class data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List | None = None, *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to remove words with incorrect substrings.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
-
-should_keep_word_with_incorrect_substrings(word, substrings)[source]
-
- -
- -
-
-

data_juicer.ops.mapper.sentence_split_mapper module

-
-
-class data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to split text samples to sentences.

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

data_juicer.ops.mapper.whitespace_normalization_mapper module

-
-
-class data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]
-

Bases: Mapper

-

Mapper to normalize different kinds of whitespaces to the whitespace ‘ ‘ (0x20) in text samples.

-

Different kinds of whitespaces can be found here: -https://en.wikipedia.org/wiki/Whitespace_character

-
-
-process(sample)[source]
-

For sample level, sample –> sample

-
-
Parameters:
-

sample – sample to process

-
-
Returns:
-

processed sample

-
-
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
\ No newline at end of file
diff --git a/data_juicer.ops.selector.html b/data_juicer.ops.selector.html deleted file mode 100644 index 89a079181..000000000 --- a/data_juicer.ops.selector.html +++ /dev/null @@ -1,191 +0,0 @@
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.ops.selector package

-
-

Submodules

-
-
-

data_juicer.ops.selector.frequency_specified_field_selector module

-
-
-class data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]
-

Bases: Selector

-

Selector to select samples based on the sorted frequency of specified -field.

-
-
-process(dataset)[source]
-

Dataset –> dataset.

-
-
Parameters:
-

dataset – input dataset

-
-
Returns:
-

selected dataset.

-
-
-
- -
- -
-
-

data_juicer.ops.selector.topk_specified_field_selector module

-
-
-class data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]
-

Bases: Selector

-

Selector to select top samples based on the sorted specified field -value.

-
-
-process(dataset)[source]
-

Dataset –> dataset.

-
-
Parameters:
-

dataset – input dataset

-
-
Returns:
-

selected dataset.

-
-
-
- -
- -
-
-data_juicer.ops.selector.topk_specified_field_selector.to_number(s, reverse=True)[source]
-
- -
-
-
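A sketch of selecting the top-k samples by a numeric field; the 'score' field and the toy dataset are illustrative assumptions.

from datasets import Dataset
from data_juicer.ops.selector.topk_specified_field_selector import \
    TopkSpecifiedFieldSelector

dataset = Dataset.from_list([{'text': f'doc {i}', 'score': i} for i in range(10)])

op = TopkSpecifiedFieldSelector(field_key='score', topk=3, reverse=True)
selected = op.process(dataset)  # dataset -> dataset containing the top-3 samples by 'score'
print(selected['score'])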

Module contents

-
-
- - -
-
- -
-
-
-
\ No newline at end of file
diff --git a/data_juicer.utils.html b/data_juicer.utils.html deleted file mode 100644 index 836bbdeaa..000000000 --- a/data_juicer.utils.html +++ /dev/null @@ -1,594 +0,0 @@
- - -
- -
-
-
- -
-
-
-
- -
-

data_juicer.utils package

-
-

Submodules

-
-
-

data_juicer.utils.asset_utils module

-
-
-data_juicer.utils.asset_utils.load_words_asset(words_dir: str, words_type: str)[source]
-

Load words from an asset file named words_type. If no valid asset file is found, download it from the ASSET_LINKS cached by the data_juicer team.

-
-
Parameters:
-
    -
  • words_dir – directory that stores asset file(s)

  • -
  • words_type – name of target words assets

  • -
-
-
Returns:
-

a dict that stores words assets, whose keys are language -names, and the values are lists of words

-
-
-
- -
-
-
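A usage sketch; the 'stopwords' asset name is an assumption inferred from the stopwords filter documented earlier.

from data_juicer.utils.asset_utils import load_words_asset

words = load_words_asset(words_dir='./assets', words_type='stopwords')
print(list(words.keys())[:5])  # language names, each mapping to a list of words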

data_juicer.utils.cache_utils module

-
-
-

data_juicer.utils.ckpt_utils module

-
-
-class data_juicer.utils.ckpt_utils.CheckpointManager(ckpt_dir, original_process_list, num_proc=1)[source]
-

Bases: object

-

This class is used to save the latest version of the dataset to the checkpoint directory, or to load it from there, somewhat like cache management. Rerunning the same config will reload the checkpoint and skip the ops before it.

-

If any argument of any operator in the process list is changed, all ops will be rerun from the beginning.

-
-
-check_ckpt()[source]
-

Check if checkpoint is available.

-
-
Returns:
-

True when checkpoint is available, else False

-
-
-
- -
-
-check_ops_to_skip()[source]
-

Check which ops need to be skipped in the process list.

-

If the op record list from the checkpoint is the same as the prefix of the process list, skip these ops and start processing from the checkpoint. Otherwise, process the original dataset from scratch.

-
-
Returns:
-

whether to skip some ops or not

-
-
-
- -
-
-get_left_process_list()[source]
-

Get the remaining process list of ops for processing the dataset. When a checkpoint is available, remove the already-processed ops from the process list; otherwise keep it unchanged.

-
-
Returns:
-

process list of left ops

-
-
-
- -
-
-load_ckpt()[source]
-

Load dataset from a checkpoint file.

-
-
Returns:
-

a dataset stored in checkpoint file.

-
-
-
- -
-
-record(op_name, op_args)[source]
-

Save the op name and args to the op record, which is used to compare with the process list from the config to decide whether a checkpoint is available.

-
- -
-
-save_ckpt(ds)[source]
-

Save dataset to checkpoint directory and dump processed ops list.

-
-
Parameters:
-

ds – input dataset to save

-
-
-
- -
- -
-
-
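A sketch of how an executor might drive the checkpoint manager; the process list stub and directory paths are illustrative.

from data_juicer.utils.ckpt_utils import CheckpointManager

process_list = [{'clean_email_mapper': {}}]  # normally parsed from the config
ckpt = CheckpointManager(ckpt_dir='./outputs/ckpt',
                         original_process_list=process_list)

if ckpt.check_ckpt():
    dataset = ckpt.load_ckpt()                   # resume from the saved dataset
    process_list = ckpt.get_left_process_list()  # run only the ops not yet applied
# ... apply the remaining ops, then persist the result:
# ckpt.save_ckpt(dataset)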

data_juicer.utils.file_utils module

-
-
-data_juicer.utils.file_utils.find_files_with_suffix(path: str | Path, suffixes: str | List[str] | Tuple[str] | None = None) List[str][source]
-

Traverse a path to find all files with the specified suffixes.

-
-
Parameters:
-
    -
  • path – path (str/Path): source path

  • -
  • suffixes – specified file suffixes, e.g. ‘.txt’ or [‘.txt’, ‘.md’]

  • -
-
-
Returns:
-

list of all files with the specified suffixes

-
-
-
- -
-
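For example (the ./data path is illustrative):

from data_juicer.utils.file_utils import find_files_with_suffix

files = find_files_with_suffix('./data', suffixes=['.txt', '.md'])
print(len(files))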
-data_juicer.utils.file_utils.is_absolute_path(path: str | Path) bool[source]
-

Check whether the input path is an absolute path.

-
-
Parameters:
-

path – input path

-
-
Returns:
-

True means the input path is an absolute path; False means it is a relative path.

-
-
-
- -
-
-

data_juicer.utils.logger_utils module

-
-
-class data_juicer.utils.logger_utils.HiddenPrints[source]
-

Bases: object

-

Define a scope that hides the outputs produced within it.

-
- -
-
-class data_juicer.utils.logger_utils.StreamToLoguru(level='INFO', caller_names=('datasets', 'logging'))[source]
-

Bases: object

-

Stream object that redirects writes to a logger instance.

-
-
-flush()[source]
-
- -
-
-write(buf)[source]
-
- -
- -
-
-data_juicer.utils.logger_utils.get_caller_name(depth=0)[source]
-

Get caller name by depth.

-
-
Parameters:
-

depth – depth of caller context, use 0 for caller depth.

-
-
Returns:
-

module name of the caller

-
-
-
- -
-
-data_juicer.utils.logger_utils.get_log_file_path()[source]
-

Get the path to the location of the log file.

-
-
Returns:
-

a location of log file.

-
-
-
- -
-
-data_juicer.utils.logger_utils.redirect_sys_output(log_level='INFO')[source]
-

Redirect stdout/stderr to loguru with log level.

-
-
Parameters:
-

log_level – log level string of loguru. Default value: “INFO”.

-
-
-
- -
-
-data_juicer.utils.logger_utils.setup_logger(save_dir, distributed_rank=0, filename='log.txt', mode='o', redirect=True)[source]
-

Setup logger for training and testing.

-
-
Parameters:
-
    -
  • save_dir – location to save log file

  • -
  • distributed_rank – device rank when multi-gpu environment

  • -
  • filename – log file name to save

  • -
  • mode – log file write mode, append or override; the default is ‘o’ (override).

  • -
  • redirect – whether to redirect system output

  • -
-
-
Returns:
-

logger instance.

-
-
-
- -
-
-
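A minimal setup sketch; after the call, loguru messages and (with redirect=True) stdout/stderr are written to the chosen log file.

from loguru import logger
from data_juicer.utils.logger_utils import setup_logger

setup_logger(save_dir='./outputs/logs', filename='log.txt', mode='o', redirect=True)
logger.info('logging is now mirrored to ./outputs/logs/log.txt')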

data_juicer.utils.model_utils module

-
-
-data_juicer.utils.model_utils.check_model(model_name, args=(), force=False)[source]
-

Check whether a model exists in MODEL_PATH. If it exists, return its full path; otherwise, download it from the cached model links.

-
-
Parameters:
-
    -
  • model_name – a specified model name

  • -
  • args – optional extra args of model.

  • -
  • force – whether to download the model forcefully or not. Sometimes the model file may be incomplete, so it needs to be downloaded again forcefully.

  • -
-
-
-
- -
-
-data_juicer.utils.model_utils.get_model(model_key, lang='en', model_type='sentencepiece')[source]
-

Get a model or a tokenizer from MODEL_ZOO.

-
-
Parameters:
-

model_key – name of the model or tokenizer

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_diversity_model(model_name, lang)[source]
-

Prepare diversity model for specific language.

-
-
Parameters:
-
    -
  • model_name – the model name to be loaded.

  • -
  • lang – language of diversity model. Should be one of [“zh”, -“en”]

  • -
-
-
Returns:
-

corresponding diversity model

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_fasttext_model(model_name)[source]
-

Prepare and load a fasttext model.

-
-
Parameters:
-

model_name – input model name

-
-
Returns:
-

model instance.

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_huggingface_tokenizer(tokenizer_name)[source]
-

Prepare and load a tokenizer from HuggingFace.

-
-
Parameters:
-

tokenizer_name – input tokenizer name

-
-
Returns:
-

a tokenizer instance.

-
-
-
- -
-
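A sketch of loading a tokenizer; ‘bert-base-uncased’ is only an illustrative Hugging Face name, and the returned object is assumed to be a standard transformers tokenizer.

from data_juicer.utils.model_utils import prepare_huggingface_tokenizer

tokenizer = prepare_huggingface_tokenizer('bert-base-uncased')
print(tokenizer.tokenize('data juicer'))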
-data_juicer.utils.model_utils.prepare_kenlm_model(model_name, lang)[source]
-

Prepare and load a kenlm model.

-
-
Parameters:
-
    -
  • model_name – input model name in formatting syntax.

  • -
  • lang – language to render model name

  • -
-
-
Returns:
-

model instance.

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_model(lang='en', model_type='sentencepiece', model_key=None)[source]
-

Prepare and load a model or a tokenizer from MODEL_ZOO.

-
-
Parameters:
-
    -
  • lang – which lang model to load

  • -
  • model_type – model or tokenizer type

  • -
  • model_key – tokenizer name, only used when prepare HuggingFace -tokenizer

  • -
-
-
Returns:
-

a model or tokenizer instance

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_nltk_model(model_name, lang)[source]
-

Prepare and load an NLTK punkt model.

-
-
Parameters:
-
    -
  • model_name – input model name in formatting syntax

  • -
  • lang – language to render model name

  • -
-
-
Returns:
-

model instance.

-
-
-
- -
-
-data_juicer.utils.model_utils.prepare_sentencepiece_model(model_name, lang)[source]
-

Prepare and load a sentencepiece model.

-
-
Parameters:
-
    -
  • model_name – input model name in formatting syntax

  • -
  • lang – language to render model name

  • -
-
-
Returns:
-

model instance.

-
-
-
- -
-
-

data_juicer.utils.registry module

-
-
-class data_juicer.utils.registry.Registry(name: str)[source]
-

Bases: object

-

This class is used to register modules to a registry under a repo name.

-
-
-get(module_key)[source]
-

Get the module named module_key from the current registry. If not found, return None.

-
-
Parameters:
-

module_key – specified module name

-
-
Returns:
-

module named module_key

-
-
-
- -
-
-list()[source]
-

Log the list of modules in the current registry.

-
- -
-
-property modules
-

Get all modules in current registry.

-
-
Returns:
-

a dict storing modules in current registry.

-
-
-
- -
-
-property name
-

Get name of current registry.

-
-
Returns:
-

name of current registry.

-
-
-
- -
-
-register_module(module_name: str | None = None, module_cls: type | None = None, force=False)[source]
-

Register a module class object to the registry with the specified module name.

-
-
Parameters:
-
    -
  • module_name – module name

  • -
  • module_cls – module class object

  • -
  • force – Whether to override an existing class with -the same name. Default: False.

  • -
-
-
-

Example

-
>>> registry = Registry('formatter')
>>> @registry.register_module()
>>> class TextFormatter:
>>>     pass

>>> class TextFormatter2:
>>>     pass
>>> registry.register_module(module_name='text_formatter2',
                             module_cls=TextFormatter2)
-
-
- -
- -
-
-

Module contents

-
-
- - -
-
- -
-
-
-
\ No newline at end of file
diff --git a/genindex.html b/genindex.html index 28c4aa24b..97df4d40b 100644 --- a/genindex.html +++ b/genindex.html @@ -40,10 +40,8 @@ @@ -71,1367 +69,8 @@

Index

  • data_juicer
  • data_juicer.analysis
  • data_juicer.analysis.column_wise_analysis
  • data_juicer.analysis.diversity_analysis
  • data_juicer.analysis.overall_analysis
  • data_juicer.config
  • data_juicer.config.config
  • data_juicer.core
  • data_juicer.core.analyser
  • data_juicer.core.data
  • data_juicer.core.executor
  • data_juicer.core.exporter
  • data_juicer.core.tracer
  • data_juicer.format
  • data_juicer.format.csv_formatter
  • data_juicer.format.formatter
  • data_juicer.format.json_formatter
  • data_juicer.format.load
  • data_juicer.format.mixture_formatter
  • data_juicer.format.parquet_formatter
  • data_juicer.format.text_formatter
  • data_juicer.format.tsv_formatter
  • data_juicer.ops
  • data_juicer.ops.base_op
  • data_juicer.ops.common
  • data_juicer.ops.common.helper_func
  • data_juicer.ops.common.special_characters
  • data_juicer.ops.deduplicator
  • data_juicer.ops.deduplicator.document_deduplicator
  • data_juicer.ops.deduplicator.document_minhash_deduplicator
  • data_juicer.ops.deduplicator.document_simhash_deduplicator
  • data_juicer.ops.filter
  • data_juicer.ops.filter.alphanumeric_filter
  • data_juicer.ops.filter.average_line_length_filter
  • data_juicer.ops.filter.character_repetition_filter
  • data_juicer.ops.filter.flagged_words_filter
  • data_juicer.ops.filter.language_id_score_filter
  • data_juicer.ops.filter.maximum_line_length_filter
  • data_juicer.ops.filter.perplexity_filter
  • data_juicer.ops.filter.special_characters_filter
  • data_juicer.ops.filter.specified_field_filter

- diff --git a/index.html b/index.html index a0d58ed81..7ea444540 100644 --- a/index.html +++ b/index.html @@ -16,8 +16,7 @@ - - + @@ -42,10 +41,13 @@ @@ -78,72 +80,6 @@

Welcome to data-juicer’s documentation!

data_juicer

-
@@ -158,9 +94,7 @@

Indices and tables