diff --git a/.doctrees/data_juicer.analysis.doctree b/.doctrees/data_juicer.analysis.doctree index d0e773001..177be1c3d 100644 Binary files a/.doctrees/data_juicer.analysis.doctree and b/.doctrees/data_juicer.analysis.doctree differ diff --git a/.doctrees/data_juicer.core.doctree b/.doctrees/data_juicer.core.doctree index 3dd2879d4..30eb6af9a 100644 Binary files a/.doctrees/data_juicer.core.doctree and b/.doctrees/data_juicer.core.doctree differ diff --git a/.doctrees/data_juicer.format.doctree b/.doctrees/data_juicer.format.doctree index 0d369e7af..a0e703738 100644 Binary files a/.doctrees/data_juicer.format.doctree and b/.doctrees/data_juicer.format.doctree differ diff --git a/.doctrees/data_juicer.ops.common.doctree b/.doctrees/data_juicer.ops.common.doctree index 3b3c006f6..bc2074b92 100644 Binary files a/.doctrees/data_juicer.ops.common.doctree and b/.doctrees/data_juicer.ops.common.doctree differ diff --git a/.doctrees/data_juicer.ops.deduplicator.doctree b/.doctrees/data_juicer.ops.deduplicator.doctree index e189764b6..304898271 100644 Binary files a/.doctrees/data_juicer.ops.deduplicator.doctree and b/.doctrees/data_juicer.ops.deduplicator.doctree differ diff --git a/.doctrees/data_juicer.ops.doctree b/.doctrees/data_juicer.ops.doctree index d0f37495a..3ace95e38 100644 Binary files a/.doctrees/data_juicer.ops.doctree and b/.doctrees/data_juicer.ops.doctree differ diff --git a/.doctrees/data_juicer.ops.filter.doctree b/.doctrees/data_juicer.ops.filter.doctree index bb67dc4fa..193ef3ce9 100644 Binary files a/.doctrees/data_juicer.ops.filter.doctree and b/.doctrees/data_juicer.ops.filter.doctree differ diff --git a/.doctrees/data_juicer.ops.mapper.doctree b/.doctrees/data_juicer.ops.mapper.doctree index 7f796dd8b..35a03ff5b 100644 Binary files a/.doctrees/data_juicer.ops.mapper.doctree and b/.doctrees/data_juicer.ops.mapper.doctree differ diff --git a/.doctrees/data_juicer.ops.selector.doctree b/.doctrees/data_juicer.ops.selector.doctree index 
cce2ba322..a6c36d698 100644 Binary files a/.doctrees/data_juicer.ops.selector.doctree and b/.doctrees/data_juicer.ops.selector.doctree differ diff --git a/.doctrees/data_juicer.utils.doctree b/.doctrees/data_juicer.utils.doctree index fb149ce2e..8b363f6fb 100644 Binary files a/.doctrees/data_juicer.utils.doctree and b/.doctrees/data_juicer.utils.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle index 77795d23e..5288b3f45 100644 Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html index 9f228513d..f06990850 100644 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ b/_modules/data_juicer/analysis/column_wise_analysis.html @@ -124,7 +124,7 @@
dataset better.
"""
- def __init__(self, cfg=None):
+[docs] def __init__(self, cfg=None):
"""
Initialization method.
@@ -131,7 +131,7 @@ Source code for data_juicer.core.analyser
# parsed_res
self.overall_result = None
self.overall_single_plot_path = None
- self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
+ self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
[docs] def run(self, load_data_np=None):
"""
diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index 045c9b4f6..5a9fa91e6 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -144,7 +144,7 @@ Source code for data_juicer.core.data
[docs]class NestedQueryDict(dict):
"""Enhanced dict for better usability."""
- def __init__(self, *args, **kargs):
+[docs] def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another DatasetDict instance
self.__dict__ = copy.copy(args[0].__dict__)
@@ -155,7 +155,7 @@ Source code for data_juicer.core.data
# batched sample, (k & v) are organized by list manner
for k, v in self.items():
if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
- self[k] = [NestedQueryDict(item) for item in v]
+ self[k] = [NestedQueryDict(item) for item in v]
def __getitem__(self, key):
return nested_query(self, key)
@@ -164,13 +164,13 @@ Source code for data_juicer.core.data
[docs]class NestedDatasetDict(DatasetDict):
"""Enhanced HuggingFace-DatasetDict for better usability and efficiency."""
- def __init__(self, *args, **kargs):
+[docs] def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another DatasetDict instance
self.__dict__ = copy.copy(args[0].__dict__)
else:
# init from scratch
- super().__init__(*args, **kargs)
+ super().__init__(*args, **kargs)
def __getitem__(self, key):
return nested_query(self, key)
@@ -189,7 +189,7 @@ Source code for data_juicer.core.data
[docs]class NestedDataset(Dataset):
"""Enhanced HuggingFace-Dataset for better usability and efficiency."""
- def __init__(self, *args, **kargs):
+[docs] def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another Dataset instance
self.__dict__ = copy.copy(args[0].__dict__)
@@ -197,7 +197,7 @@ Source code for data_juicer.core.data
# init from scratch
super().__init__(*args, **kargs)
- self.need_to_cleanup_caches = not is_caching_enabled()
+ self.need_to_cleanup_caches = not is_caching_enabled()
def __getitem__(self, key):
if isinstance(key, str):
diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html
index 14d3ed34e..de867852d 100644
--- a/_modules/data_juicer/core/executor.html
+++ b/_modules/data_juicer/core/executor.html
@@ -94,7 +94,7 @@ Source code for data_juicer.core.executor
ops in the config file in order and generate a processed dataset.
"""
- def __init__(self, cfg=None):
+[docs] def __init__(self, cfg=None):
"""
Initialization method.
@@ -149,7 +149,7 @@ Source code for data_juicer.core.executor
self.op_list_to_trace = self.cfg.op_list_to_trace
if len(self.cfg.op_list_to_trace) == 0:
logger.info('Trace for all ops.')
- self.op_list_to_trace = set(OPERATORS.modules.keys())
+ self.op_list_to_trace = set(OPERATORS.modules.keys())
[docs] def run(self, load_data_np=None):
"""
diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html
index db8947990..84cbf9dc8 100644
--- a/_modules/data_juicer/core/exporter.html
+++ b/_modules/data_juicer/core/exporter.html
@@ -86,7 +86,7 @@ Source code for data_juicer.core.exporter
GiB = 2**30 # 1024*1024*1024
TiB = 2**40 # 1024*1024*1024*1024
- def __init__(self,
+[docs] def __init__(self,
export_path,
export_shard_size=0,
export_in_parallel=True,
@@ -139,7 +139,7 @@ Source code for data_juicer.core.exporter
logger.warning(f'The export_shard_size [{self.max_shard_size_str}]'
f' is larger than 1TiB. It might generate large '
f'single shard file and make loading and exporting '
- f'slower.')
+ f'slower.')
def _get_suffix(self, export_path):
"""
diff --git a/_modules/data_juicer/core/ray_executor.html b/_modules/data_juicer/core/ray_executor.html
index 8abcea806..805dba302 100644
--- a/_modules/data_juicer/core/ray_executor.html
+++ b/_modules/data_juicer/core/ray_executor.html
@@ -90,7 +90,7 @@ Source code for data_juicer.core.ray_executor
2. Advanced functions such as checkpoint, tracer are not supported.
"""
- def __init__(self, cfg=None):
+[docs] def __init__(self, cfg=None):
"""
Initialization method.
@@ -104,7 +104,7 @@ Source code for data_juicer.core.ray_executor
# init ray
logger.info('Initing Ray ...')
ray.init(self.cfg.ray_address)
- self.process_list = self.cfg.process
+ self.process_list = self.cfg.process
[docs] def run(self, load_data_np=None):
diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html
index 555d13cfa..6c11165d0 100644
--- a/_modules/data_juicer/core/tracer.html
+++ b/_modules/data_juicer/core/tracer.html
@@ -84,7 +84,7 @@ Source code for data_juicer.core.tracer
The comparison results will be stored in the work directory.
"""
- def __init__(self, work_dir, show_num=10):
+[docs] def __init__(self, work_dir, show_num=10):
"""
Initialization method.
@@ -96,7 +96,7 @@ Source code for data_juicer.core.tracer
self.work_dir = os.path.join(work_dir, 'trace')
if not os.path.exists(self.work_dir):
os.makedirs(self.work_dir)
- self.show_num = show_num
+ self.show_num = show_num
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset,
processed_ds: Dataset, text_key: str):
diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html
index 338d1d688..c913fd93b 100644
--- a/_modules/data_juicer/format/csv_formatter.html
+++ b/_modules/data_juicer/format/csv_formatter.html
@@ -81,7 +81,7 @@ Source code for data_juicer.format.csv_formatter
"""
SUFFIXES = ['.csv']
- def __init__(self, dataset_path, suffixes=None, **kwargs):
+[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -94,7 +94,7 @@ Source code for data_juicer.format.csv_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='csv',
**kwargs,
- )
+ )
diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html
index 85379962d..d5f72b398 100644
--- a/_modules/data_juicer/format/formatter.html
+++ b/_modules/data_juicer/format/formatter.html
@@ -94,7 +94,7 @@ Source code for data_juicer.format.formatter
"""The class is used to load a dataset from local files or local
directory."""
- def __init__(
+[docs] def __init__(
self,
dataset_path: str,
type: str,
@@ -120,7 +120,7 @@ Source code for data_juicer.format.formatter
self.kwargs = kwargs
self.text_keys = text_keys
self.data_files = find_files_with_suffix(dataset_path, suffixes)
- self.add_suffix = add_suffix
+ self.add_suffix = add_suffix
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset:
"""
@@ -155,7 +155,7 @@ Source code for data_juicer.format.formatter
"""The class is used to load a dataset from repository of huggingface
hub."""
- def __init__(self,
+[docs] def __init__(self,
dataset_path: str,
text_keys: List[str] = None,
**kwargs):
@@ -169,7 +169,7 @@ Source code for data_juicer.format.formatter
"""
self.path = dataset_path
self.text_keys = text_keys
- self.kwargs = kwargs
+ self.kwargs = kwargs
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset:
"""
diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html
index adae4804b..5e2817e66 100644
--- a/_modules/data_juicer/format/json_formatter.html
+++ b/_modules/data_juicer/format/json_formatter.html
@@ -81,7 +81,7 @@ Source code for data_juicer.format.json_formatter
"""
SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
- def __init__(self, dataset_path, suffixes=None, **kwargs):
+[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -94,7 +94,7 @@ Source code for data_juicer.format.json_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='json',
**kwargs,
- )
+ )
diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html
index f38cbbd1a..003f8e53a 100644
--- a/_modules/data_juicer/format/mixture_formatter.html
+++ b/_modules/data_juicer/format/mixture_formatter.html
@@ -83,7 +83,7 @@ Source code for data_juicer.format.mixture_formatter
every dataset and merging them, and then exports the merged datasset as a
new mixed dataset."""
- def __init__(self,
+[docs] def __init__(self,
dataset_path: str,
suffixes: Union[str, List[str], Tuple[str]] = None,
text_keys=None,
@@ -109,7 +109,7 @@ Source code for data_juicer.format.mixture_formatter
text_keys=text_keys,
add_suffix=add_suffix,
**kwargs) for data_prefix in data_prefixes
- ]
+ ]
def _get_weight(self, data_prefix):
"""
diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html
index c7608e2da..8af8186f7 100644
--- a/_modules/data_juicer/format/parquet_formatter.html
+++ b/_modules/data_juicer/format/parquet_formatter.html
@@ -81,7 +81,7 @@ Source code for data_juicer.format.parquet_formatter
"""
SUFFIXES = ['.parquet']
- def __init__(self, dataset_path, suffixes=None, **kwargs):
+[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -94,7 +94,7 @@ Source code for data_juicer.format.parquet_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='parquet',
**kwargs,
- )
+ )
diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html
index 4140a5925..28fffda7f 100644
--- a/_modules/data_juicer/format/text_formatter.html
+++ b/_modules/data_juicer/format/text_formatter.html
@@ -143,7 +143,7 @@ Source code for data_juicer.format.text_formatter
'.m', '.smali'
]
- def __init__(self,
+[docs] def __init__(self,
dataset_path,
suffixes=None,
add_suffix=False,
@@ -165,7 +165,7 @@ Source code for data_juicer.format.text_formatter
**kwargs,
)
self.dataset_path = dataset_path
- self.add_suffix = add_suffix
+ self.add_suffix = add_suffix
[docs] def load_dataset(self, num_proc: int = 1) -> Dataset:
"""
diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html
index 7e52a8fa5..c37226a65 100644
--- a/_modules/data_juicer/format/tsv_formatter.html
+++ b/_modules/data_juicer/format/tsv_formatter.html
@@ -81,7 +81,7 @@ Source code for data_juicer.format.tsv_formatter
"""
SUFFIXES = ['.tsv']
- def __init__(self, dataset_path, suffixes=None, **kwargs):
+[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -95,7 +95,7 @@ Source code for data_juicer.format.tsv_formatter
type='csv',
delimiter='\t',
**kwargs,
- )
+ )
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index c42d12535..ec4494caa 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -76,7 +76,7 @@ Source code for data_juicer.ops.base_op
[docs]class Mapper:
- def __init__(self, text_key: str = None):
+[docs] def __init__(self, text_key: str = None):
"""
Base class that conducts text editing.
@@ -90,7 +90,7 @@ Source code for data_juicer.ops.base_op
self.process = wrap_func_with_nested_access(self.process)
# In default, it's a normal OP instead of batched OP
- self._batched_op = False
+ self._batched_op = False
[docs] def process(self, sample):
"""
@@ -107,7 +107,7 @@ Source code for data_juicer.ops.base_op
[docs]class Filter:
- def __init__(self, text_key: str = None):
+[docs] def __init__(self, text_key: str = None):
"""
Base class that removes specific info.
@@ -119,7 +119,7 @@ Source code for data_juicer.ops.base_op
self.text_key = text_key
from data_juicer.core.data import wrap_func_with_nested_access
self.process = wrap_func_with_nested_access(self.process)
- self.compute_stats = wrap_func_with_nested_access(self.compute_stats)
+ self.compute_stats = wrap_func_with_nested_access(self.compute_stats)
[docs] def compute_stats(self, sample, context=False):
"""
@@ -145,7 +145,7 @@ Source code for data_juicer.ops.base_op
[docs]class Deduplicator:
- def __init__(self, text_key: str = None):
+[docs] def __init__(self, text_key: str = None):
"""
Base class that conducts deduplication.
@@ -157,7 +157,7 @@ Source code for data_juicer.ops.base_op
self.text_key = text_key
from data_juicer.core.data import wrap_func_with_nested_access
self.process = wrap_func_with_nested_access(self.process)
- self.compute_hash = wrap_func_with_nested_access(self.compute_hash)
+ self.compute_hash = wrap_func_with_nested_access(self.compute_hash)
[docs] def compute_hash(self, sample):
"""
@@ -182,7 +182,7 @@ Source code for data_juicer.ops.base_op
[docs]class Selector:
- def __init__(self, text_key: str = None):
+[docs] def __init__(self, text_key: str = None):
"""
Base class that conducts selection in dataset-level.
@@ -193,7 +193,7 @@ Source code for data_juicer.ops.base_op
text_key = 'text'
self.text_key = text_key
from data_juicer.core.data import wrap_func_with_nested_access
- self.process = wrap_func_with_nested_access(self.process)
+ self.process = wrap_func_with_nested_access(self.process)
[docs] def process(self, dataset):
"""
diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html
index f1b319fbe..865499308 100644
--- a/_modules/data_juicer/ops/common/helper_func.html
+++ b/_modules/data_juicer/ops/common/helper_func.html
@@ -79,9 +79,9 @@ Source code for data_juicer.ops.common.helper_func
[docs]class UnionFind:
- def __init__(self):
+[docs] def __init__(self):
"""Initialization method."""
- self.parent: Dict[int, int] = {}
+ self.parent: Dict[int, int] = {}
[docs] def find(self, x):
if x not in self.parent:
diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
index 8596040c7..6a619f12f 100644
--- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
@@ -93,7 +93,7 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
Using md5 hash to deduplicate samples.
"""
- def __init__(self,
+[docs] def __init__(self,
lowercase: bool = False,
ignore_non_character: bool = False,
*args,
@@ -111,7 +111,7 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
self.lowercase = lowercase
self.remove_non_character_regex = re.compile(
f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605
- ) if ignore_non_character else None
+ ) if ignore_non_character else None
[docs] def compute_hash(self, sample):
"""
diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
index 01ea211d4..72dfde50c 100644
--- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
@@ -169,7 +169,7 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
kept in the final dataset.
"""
- def __init__(
+[docs] def __init__(
self,
tokenization: str = 'space',
window_size: PositiveInt = 5,
@@ -251,7 +251,7 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
gen.randint(0, MERSENNE_PRIME, dtype=np.uint64),
) for _ in range(self.num_permutation)],
dtype=np.uint64,
- ).T
+ ).T
[docs] def compute_hash(self, sample):
"""
diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
index 85206bbe8..d393a4ba7 100644
--- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
@@ -135,7 +135,7 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
class DocumentSimhashDeduplicator(Deduplicator):
"""Deduplicator to deduplicate samples at document-level using SimHash."""
- def __init__(self,
+[docs] def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
lowercase: bool = True,
@@ -181,7 +181,7 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
# about deduplication
self.num_blocks = num_blocks
- self.hamming_distance = hamming_distance
+ self.hamming_distance = hamming_distance
[docs] def compute_hash(self, sample):
"""
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
index 550b89c6f..42150f84c 100644
--- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html
+++ b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
@@ -85,7 +85,7 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
"""Filter to keep samples with alphabet/numeric ratio within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
tokenization: bool = False,
min_ratio: float = 0.25,
max_ratio: PositiveFloat = sys.maxsize,
@@ -116,7 +116,7 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
if tokenization:
self.model_key = prepare_model(
model_type='huggingface',
- model_key='EleutherAI/pythia-6.9b-deduped')
+ model_key='EleutherAI/pythia-6.9b-deduped')
[docs] def compute_stats(self, sample):
if self.tokenization:
diff --git a/_modules/data_juicer/ops/filter/average_line_length_filter.html b/_modules/data_juicer/ops/filter/average_line_length_filter.html
index a224804e8..4d58914b2 100644
--- a/_modules/data_juicer/ops/filter/average_line_length_filter.html
+++ b/_modules/data_juicer/ops/filter/average_line_length_filter.html
@@ -85,7 +85,7 @@ Source code for data_juicer.ops.filter.average_line_length_filter
"""Filter to keep samples with average line length within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
min_len: PositiveInt = 10,
max_len: PositiveInt = sys.maxsize,
*args,
@@ -104,7 +104,7 @@ Source code for data_juicer.ops.filter.average_line_length_filter
"""
super().__init__(*args, **kwargs)
self.min_len = min_len
- self.max_len = max_len
+ self.max_len = max_len
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/character_repetition_filter.html b/_modules/data_juicer/ops/filter/character_repetition_filter.html
index db03058bc..40e1b1c3f 100644
--- a/_modules/data_juicer/ops/filter/character_repetition_filter.html
+++ b/_modules/data_juicer/ops/filter/character_repetition_filter.html
@@ -86,7 +86,7 @@ Source code for data_juicer.ops.filter.character_repetition_filter
"""Filter to keep samples with char-level n-gram repetition ratio within a
\ specific range."""
- def __init__(self,
+[docs] def __init__(self,
rep_len: PositiveInt = 10,
min_ratio: ClosedUnitInterval = 0.0,
max_ratio: ClosedUnitInterval = 0.5,
@@ -108,7 +108,7 @@ Source code for data_juicer.ops.filter.character_repetition_filter
super().__init__(*args, **kwargs)
self.n = rep_len
self.min_ratio = min_ratio
- self.max_ratio = max_ratio
+ self.max_ratio = max_ratio
[docs] def compute_stats(self, sample):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/flagged_words_filter.html b/_modules/data_juicer/ops/filter/flagged_words_filter.html
index 39bce2dc8..8be147f2a 100644
--- a/_modules/data_juicer/ops/filter/flagged_words_filter.html
+++ b/_modules/data_juicer/ops/filter/flagged_words_filter.html
@@ -91,7 +91,7 @@ Source code for data_juicer.ops.filter.flagged_words_filter
"""Filter to keep samples with flagged-word ratio less than a specific max
value."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
tokenization: bool = False,
max_ratio: ClosedUnitInterval = 0.045,
@@ -138,7 +138,7 @@ Source code for data_juicer.ops.filter.flagged_words_filter
]
if tokenization:
self.model_key = prepare_model(lang=lang,
- model_type='sentencepiece')
+ model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/language_id_score_filter.html b/_modules/data_juicer/ops/filter/language_id_score_filter.html
index ba90853b8..0cbcc1d89 100644
--- a/_modules/data_juicer/ops/filter/language_id_score_filter.html
+++ b/_modules/data_juicer/ops/filter/language_id_score_filter.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.filter.language_id_score_filter
"""Filter to keep samples in a specific language with confidence score
larger than a specific min value."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = '',
min_score: ClosedUnitInterval = 0.8,
*args,
@@ -100,7 +100,7 @@ Source code for data_juicer.ops.filter.language_id_score_filter
super().__init__(*args, **kwargs)
self.lang = lang
self.min_score = min_score
- self.model_key = prepare_model(lang=lang, model_type='fasttext')
+ self.model_key = prepare_model(lang=lang, model_type='fasttext')
[docs] def compute_stats(self, sample):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
index ba53aea86..f8977679c 100644
--- a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
+++ b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
@@ -85,7 +85,7 @@ Source code for data_juicer.ops.filter.maximum_line_length_filter
"""Filter to keep samples with maximum line length within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
min_len: PositiveInt = 10,
max_len: PositiveInt = sys.maxsize,
*args,
@@ -104,7 +104,7 @@ Source code for data_juicer.ops.filter.maximum_line_length_filter
"""
super().__init__(*args, **kwargs)
self.min_len = min_len
- self.max_len = max_len
+ self.max_len = max_len
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/perplexity_filter.html b/_modules/data_juicer/ops/filter/perplexity_filter.html
index ebaa84b09..e91dfdeab 100644
--- a/_modules/data_juicer/ops/filter/perplexity_filter.html
+++ b/_modules/data_juicer/ops/filter/perplexity_filter.html
@@ -89,7 +89,7 @@ Source code for data_juicer.ops.filter.perplexity_filter
"""Filter to keep samples with perplexity score less than a specific max
value."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
max_ppl: PositiveFloat = 1500,
*args,
@@ -108,7 +108,7 @@ Source code for data_juicer.ops.filter.perplexity_filter
self.lang = lang
self.sp_model_key = prepare_model(lang=lang,
model_type='sentencepiece')
- self.kl_model_key = prepare_model(lang=lang, model_type='kenlm')
+ self.kl_model_key = prepare_model(lang=lang, model_type='kenlm')
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/special_characters_filter.html b/_modules/data_juicer/ops/filter/special_characters_filter.html
index 002b6d2c7..b88e299f8 100644
--- a/_modules/data_juicer/ops/filter/special_characters_filter.html
+++ b/_modules/data_juicer/ops/filter/special_characters_filter.html
@@ -86,7 +86,7 @@ Source code for data_juicer.ops.filter.special_characters_filter
"""Filter to keep samples with special-char ratio within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
min_ratio: ClosedUnitInterval = 0.0,
max_ratio: ClosedUnitInterval = 0.25,
*args,
@@ -105,7 +105,7 @@ Source code for data_juicer.ops.filter.special_characters_filter
"""
super().__init__(*args, **kwargs)
self.min_ratio = min_ratio
- self.max_ratio = max_ratio
+ self.max_ratio = max_ratio
[docs] def compute_stats(self, sample):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/specified_field_filter.html b/_modules/data_juicer/ops/filter/specified_field_filter.html
index b8099599c..cfa325292 100644
--- a/_modules/data_juicer/ops/filter/specified_field_filter.html
+++ b/_modules/data_juicer/ops/filter/specified_field_filter.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.filter.specified_field_filter
specified target value, the sample will be filtered.
"""
- def __init__(self,
+[docs] def __init__(self,
field_key: str = '',
target_value: Union[List, Tuple] = [],
*args,
@@ -102,7 +102,7 @@ Source code for data_juicer.ops.filter.specified_field_filter
"""
super().__init__(*args, **kwargs)
self.field_key = field_key
- self.target_value = target_value
+ self.target_value = target_value
diff --git a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
index 20723d1d8..d7969cb19 100644
--- a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
+++ b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
@@ -93,7 +93,7 @@ Source code for data_juicer.ops.filter.specified_numeric_field_filter
specified range, the sample will be filtered.
"""
- def __init__(self,
+[docs] def __init__(self,
field_key: str = '',
min_value: float = -sys.maxsize,
max_value: float = sys.maxsize,
@@ -118,7 +118,7 @@ Source code for data_juicer.ops.filter.specified_numeric_field_filter
super().__init__(*args, **kwargs)
self.field_key = field_key
self.min_value = min_value
- self.max_value = max_value
+ self.max_value = max_value
diff --git a/_modules/data_juicer/ops/filter/stopwords_filter.html b/_modules/data_juicer/ops/filter/stopwords_filter.html
index 405113a62..30773510a 100644
--- a/_modules/data_juicer/ops/filter/stopwords_filter.html
+++ b/_modules/data_juicer/ops/filter/stopwords_filter.html
@@ -91,7 +91,7 @@ Source code for data_juicer.ops.filter.stopwords_filter
"""Filter to keep samples with stopword ratio larger than a specific min
value."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
tokenization: bool = False,
min_ratio: ClosedUnitInterval = 0.3,
@@ -136,7 +136,7 @@ Source code for data_juicer.ops.filter.stopwords_filter
]
if tokenization:
self.model_key = prepare_model(lang=lang,
- model_type='sentencepiece')
+ model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/suffix_filter.html b/_modules/data_juicer/ops/filter/suffix_filter.html
index 0925d4988..43f502e27 100644
--- a/_modules/data_juicer/ops/filter/suffix_filter.html
+++ b/_modules/data_juicer/ops/filter/suffix_filter.html
@@ -80,7 +80,7 @@ Source code for data_juicer.ops.filter.suffix_filter
class SuffixFilter(Filter):
"""Filter to keep samples with specified suffix."""
- def __init__(self,
+[docs] def __init__(self,
suffixes: Union[str, List[str], Tuple[str]] = [],
*args,
**kwargs):
@@ -98,7 +98,7 @@ Source code for data_juicer.ops.filter.suffix_filter
elif isinstance(suffixes, str):
self.suffixes = [suffixes]
else:
- self.suffixes = suffixes
+ self.suffixes = suffixes
diff --git a/_modules/data_juicer/ops/filter/text_length_filter.html b/_modules/data_juicer/ops/filter/text_length_filter.html
index 936f4ccf5..7ed079cd8 100644
--- a/_modules/data_juicer/ops/filter/text_length_filter.html
+++ b/_modules/data_juicer/ops/filter/text_length_filter.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.filter.text_length_filter
"""Filter to keep samples with total text length within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
min_len: PositiveInt = 10,
max_len: PositiveInt = sys.maxsize,
*args,
@@ -102,7 +102,7 @@ Source code for data_juicer.ops.filter.text_length_filter
"""
super().__init__(*args, **kwargs)
self.min_len = min_len
- self.max_len = max_len
+ self.max_len = max_len
[docs] def compute_stats(self, sample):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/token_num_filter.html b/_modules/data_juicer/ops/filter/token_num_filter.html
index 5cba586b1..49a7af726 100644
--- a/_modules/data_juicer/ops/filter/token_num_filter.html
+++ b/_modules/data_juicer/ops/filter/token_num_filter.html
@@ -85,7 +85,7 @@ Source code for data_juicer.ops.filter.token_num_filter
"""Filter to keep samples with total token number within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped',
min_num: PositiveInt = 10,
max_num: PositiveInt = sys.maxsize,
@@ -109,7 +109,7 @@ Source code for data_juicer.ops.filter.token_num_filter
self.max_num = max_num
self.hf_tokenizer = hf_tokenizer
self.model_key = prepare_model(model_type='huggingface',
- model_key=hf_tokenizer)
+ model_key=hf_tokenizer)
[docs] def compute_stats(self, sample):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/word_num_filter.html b/_modules/data_juicer/ops/filter/word_num_filter.html
index b47df1c8a..04e5f048d 100644
--- a/_modules/data_juicer/ops/filter/word_num_filter.html
+++ b/_modules/data_juicer/ops/filter/word_num_filter.html
@@ -88,7 +88,7 @@ Source code for data_juicer.ops.filter.word_num_filter
"""Filter to keep samples with total words number within a specific
range."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
tokenization: bool = False,
min_num: PositiveInt = 10,
@@ -117,7 +117,7 @@ Source code for data_juicer.ops.filter.word_num_filter
if tokenization:
self.model_key = prepare_model(lang=lang,
- model_type='sentencepiece')
+ model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/filter/word_repetition_filter.html b/_modules/data_juicer/ops/filter/word_repetition_filter.html
index 0b789d646..5ecdcedab 100644
--- a/_modules/data_juicer/ops/filter/word_repetition_filter.html
+++ b/_modules/data_juicer/ops/filter/word_repetition_filter.html
@@ -90,7 +90,7 @@ Source code for data_juicer.ops.filter.word_repetition_filter
"""Filter to keep samples with word-level n-gram repetition ratio within a
\ specific range."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
tokenization: bool = False,
rep_len: PositiveInt = 10,
@@ -122,7 +122,7 @@ Source code for data_juicer.ops.filter.word_repetition_filter
if tokenization:
self.model_key = prepare_model(lang=lang,
- model_type='sentencepiece')
+ model_type='sentencepiece')
[docs] def compute_stats(self, sample, context=False):
# check if it's computed already
diff --git a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
index 5b2467929..30e3a5c79 100644
--- a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.mapper.clean_copyright_mapper
"""Mapper to clean copyright comments at the beginning of the text
samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -92,7 +92,7 @@ Source code for data_juicer.ops.mapper.clean_copyright_mapper
"""
super().__init__(*args, **kwargs)
self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/')
- self.cpat = re.compile('copyright', re.IGNORECASE)
+ self.cpat = re.compile('copyright', re.IGNORECASE)
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_email_mapper.html b/_modules/data_juicer/ops/mapper/clean_email_mapper.html
index 518d07904..1c8ef5726 100644
--- a/_modules/data_juicer/ops/mapper/clean_email_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_email_mapper.html
@@ -78,7 +78,7 @@ Source code for data_juicer.ops.mapper.clean_email_mapper
class CleanEmailMapper(Mapper):
"""Mapper to clean email in text samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -86,7 +86,7 @@ Source code for data_juicer.ops.mapper.clean_email_mapper
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
- self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
+ self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_html_mapper.html b/_modules/data_juicer/ops/mapper/clean_html_mapper.html
index 6805312be..1de758f20 100644
--- a/_modules/data_juicer/ops/mapper/clean_html_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_html_mapper.html
@@ -82,14 +82,14 @@ Source code for data_juicer.ops.mapper.clean_html_mapper
class CleanHtmlMapper(Mapper):
"""Mapper to clean html code in text samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
:param args: extra args
:param kwargs: extra args
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, **kwargs)
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
index a78962f15..5c1afded5 100644
--- a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html
@@ -78,7 +78,7 @@ Source code for data_juicer.ops.mapper.clean_ip_mapper
class CleanIpMapper(Mapper):
"""Mapper to clean ipv4 and ipv6 address in text samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -91,7 +91,7 @@ Source code for data_juicer.ops.mapper.clean_ip_mapper
self.pattern += r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))'
self.pattern += r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|'
self.pattern += r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|'
- self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6
+ self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/clean_links_mapper.html b/_modules/data_juicer/ops/mapper/clean_links_mapper.html
index 48476125d..a7cd5811b 100644
--- a/_modules/data_juicer/ops/mapper/clean_links_mapper.html
+++ b/_modules/data_juicer/ops/mapper/clean_links_mapper.html
@@ -81,7 +81,7 @@ Source code for data_juicer.ops.mapper.clean_links_mapper
class CleanLinksMapper(Mapper):
"""Mapper to clean links like http/https/ftp in text samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -96,7 +96,7 @@ Source code for data_juicer.ops.mapper.clean_links_mapper
self.pattern += r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
self.pattern += r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
self.pattern += r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])'
- self.pattern += r')'
+ self.pattern += r')'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
index 91b5dc91e..2d11c165b 100644
--- a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
+++ b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html
@@ -83,14 +83,14 @@ Source code for data_juicer.ops.mapper.expand_macro_mapper
"""Mapper to expand macro definitions in the document body of Latex
samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
:param args: extra args
:param kwargs: extra args
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, **kwargs)
def _build_non_arg_macros_dict(self, file_content):
# regex for extracting \newcommand macros without arguments
diff --git a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
index def7fdfb0..55b4c8923 100644
--- a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
+++ b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
@@ -78,14 +78,14 @@ Source code for data_juicer.ops.mapper.fix_unicode_mapper
class FixUnicodeMapper(Mapper):
"""Mapper to fix unicode errors in text samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
:param args: extra args
:param kwargs: extra args
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, **kwargs)
[docs] def process(self, sample):
sample[self.text_key] = ftfy.fix_text(sample[self.text_key])
diff --git a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
index d1dad3ad2..66ef88958 100644
--- a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
+++ b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
@@ -85,7 +85,7 @@ Source code for data_juicer.ops.mapper.nlpaug_en_mapper
class NlpaugEnMapper(Mapper):
"""Mapper to simply augment samples in English based on nlpaug library."""
- def __init__(self,
+[docs] def __init__(self,
sequential: bool = False,
aug_num: int = 1,
delete_random_word: bool = False,
@@ -180,7 +180,7 @@ Source code for data_juicer.ops.mapper.nlpaug_en_mapper
if self.sequential:
self.aug = naf.Sequential(aug_pipeline)
else:
- self.aug = aug_pipeline
+ self.aug = aug_pipeline
[docs] def process(self, samples):
# no augmentation methods are opened
diff --git a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
index 8501832d4..cce5ec9c6 100644
--- a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
+++ b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
@@ -81,7 +81,7 @@ Source code for data_juicer.ops.mapper.nlpcda_zh_mapper
class NlpcdaZhMapper(Mapper):
"""Mapper to simply augment samples in Chinese based on nlpcda library."""
- def __init__(self,
+[docs] def __init__(self,
sequential: bool = False,
aug_num: int = 1,
replace_similar_word: bool = False,
@@ -185,7 +185,7 @@ Source code for data_juicer.ops.mapper.nlpcda_zh_mapper
if not self.sequential or len(self.aug_pipeline) == 0 \
else 2
self.aug_pipeline.append(
- nlpcda.EquivalentChar(create_num=create_num))
+ nlpcda.EquivalentChar(create_num=create_num))
[docs] def process(self, samples):
# no augmentation methods are opened
diff --git a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
index 5cb8e8d7d..05b5c7104 100644
--- a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
+++ b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
@@ -81,7 +81,7 @@ Source code for data_juicer.ops.mapper.punctuation_normalization_mapper
"""Mapper to normalize unicode punctuations to English punctuations in text
\ samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -124,7 +124,7 @@ Source code for data_juicer.ops.mapper.punctuation_normalization_mapper
'】': ']',
'%': '%',
'►': '-',
- }
+ }
[docs] def process(self, sample):
sample[self.text_key] = ''.join([
diff --git a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
index 6b077694e..03f642ec3 100644
--- a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.mapper.remove_bibliography_mapper
"""Mapper to remove bibliography at the end of documents in Latex
samples."""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
@@ -96,7 +96,7 @@ Source code for data_juicer.ops.mapper.remove_bibliography_mapper
self.pattern += r'\\begin\{REFERENCES\}|'
self.pattern += r'\\begin\{thebibliography\}|'
self.pattern += r'\\bibliography\{.*\}'
- self.pattern += r').*$'
+ self.pattern += r').*$'
[docs] def process(self, sample):
sample[self.text_key] = re.sub(pattern=self.pattern,
diff --git a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
index 5eedf088c..33c667184 100644
--- a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
@@ -88,7 +88,7 @@ Source code for data_juicer.ops.mapper.remove_comments_mapper
Only support 'tex' \ for now.
"""
- def __init__(self,
+[docs] def __init__(self,
doc_type: Union[str, List[str]] = 'tex',
inline: bool = True,
multiline: bool = True,
@@ -106,7 +106,7 @@ Source code for data_juicer.ops.mapper.remove_comments_mapper
super().__init__(*args, **kwargs)
self.doc_type = doc_type
self.inline = inline
- self.multiline = multiline
+ self.multiline = multiline
[docs] def process(self, sample):
# TODO: remove different comments by sample type
diff --git a/_modules/data_juicer/ops/mapper/remove_header_mapper.html b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
index 0c9da4f6e..b21c95e44 100644
--- a/_modules/data_juicer/ops/mapper/remove_header_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
@@ -84,7 +84,7 @@ Source code for data_juicer.ops.mapper.remove_header_mapper
"""Mapper to remove headers at the beginning of documents in Latex
samples."""
- def __init__(self, drop_no_head: bool = True, *args, **kwargs):
+[docs] def __init__(self, drop_no_head: bool = True, *args, **kwargs):
"""
Initialization method.
@@ -104,7 +104,7 @@ Source code for data_juicer.ops.mapper.remove_header_mapper
self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}'
self.pattern += r')'
- self.drop_no_head = drop_no_head
+ self.drop_no_head = drop_no_head
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
index ff9eb9952..d6b0357ae 100644
--- a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
@@ -86,7 +86,7 @@ Source code for data_juicer.ops.mapper.remove_long_words_mapper
class RemoveLongWordsMapper(Mapper):
"""Mapper to remove long words within a specific range."""
- def __init__(self,
+[docs] def __init__(self,
min_len: PositiveInt = 1,
max_len: PositiveInt = sys.maxsize,
*args,
@@ -103,7 +103,7 @@ Source code for data_juicer.ops.mapper.remove_long_words_mapper
"""
super().__init__(*args, **kwargs)
self.min_len = min_len
- self.max_len = max_len
+ self.max_len = max_len
[docs] def should_keep_long_word(self, word):
if self.min_len <= len(word) <= self.max_len:
diff --git a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
index 8253ed2a8..7e82c551f 100644
--- a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
@@ -80,7 +80,7 @@ Source code for data_juicer.ops.mapper.remove_specific_chars_mapper
class RemoveSpecificCharsMapper(Mapper):
"""Mapper to clean specific chars in text samples."""
- def __init__(self,
+[docs] def __init__(self,
chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□',
*args,
**kwargs):
@@ -97,7 +97,7 @@ Source code for data_juicer.ops.mapper.remove_specific_chars_mapper
if chars_to_remove:
self.pattern = '[' + '|'.join(chars_to_remove) + ']'
else:
- self.pattern = None
+ self.pattern = None
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
index 660a0ed3f..a018dc7f6 100644
--- a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
@@ -87,7 +87,7 @@ Source code for data_juicer.ops.mapper.remove_table_text_mapper
number of tables.
"""
- def __init__(self,
+[docs] def __init__(self,
min_col: from_2_to_20 = 2,
max_col: from_2_to_20 = 20,
*args,
@@ -103,7 +103,7 @@ Source code for data_juicer.ops.mapper.remove_table_text_mapper
super().__init__(*args, **kwargs)
self.min_col = min_col
self.max_col = max_col
- self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
+ self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
index 9f7557eee..42076b59a 100644
--- a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
+++ b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
@@ -83,7 +83,7 @@ Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring
class RemoveWordsWithIncorrectSubstringsMapper(Mapper):
"""Mapper to remove words with incorrect substrings."""
- def __init__(self,
+[docs] def __init__(self,
lang: str = 'en',
tokenization: bool = False,
substrings: List = None,
@@ -106,7 +106,7 @@ Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring
self.lang = lang
if tokenization:
self.model_key = prepare_model(lang=lang,
- model_type='sentencepiece')
+ model_type='sentencepiece')
[docs] def should_keep_word_with_incorrect_substrings(self, word, substrings):
word = strip(word, SPECIAL_CHARACTERS)
diff --git a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
index ed1552d42..07d6522b6 100644
--- a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
+++ b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
@@ -79,7 +79,7 @@ Source code for data_juicer.ops.mapper.sentence_split_mapper
class SentenceSplitMapper(Mapper):
"""Mapper to split text samples to sentences."""
- def __init__(self, lang: str = 'en', *args, **kwargs):
+[docs] def __init__(self, lang: str = 'en', *args, **kwargs):
"""
Initialization method.
@@ -89,7 +89,7 @@ Source code for data_juicer.ops.mapper.sentence_split_mapper
"""
super().__init__(*args, **kwargs)
self.lang = lang
- self.model_key = prepare_model(lang=lang, model_type='nltk')
+ self.model_key = prepare_model(lang=lang, model_type='nltk')
[docs] def process(self, sample):
diff --git a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
index ff6535bd1..0ecd132bf 100644
--- a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
+++ b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
@@ -86,14 +86,14 @@ Source code for data_juicer.ops.mapper.whitespace_normalization_mapper
<
https://en.wikipedia.org/wiki/Whitespace_character
"""
- def __init__(self, *args, **kwargs):
+[docs] def __init__(self, *args, **kwargs):
"""
Initialization method.
:param args: extra args
:param kwargs: extra args
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, **kwargs)
[docs] def process(self, sample):
# remove whitespaces before and after the main content
diff --git a/_modules/data_juicer/ops/op_fusion.html b/_modules/data_juicer/ops/op_fusion.html
index 25922ad8d..78475498d 100644
--- a/_modules/data_juicer/ops/op_fusion.html
+++ b/_modules/data_juicer/ops/op_fusion.html
@@ -177,14 +177,14 @@ Source code for data_juicer.ops.op_fusion
[docs]class FusedFilter(Filter):
"""A fused operator for filters."""
- def __init__(self, fused_filters: List):
+[docs] def __init__(self, fused_filters: List):
"""
Initialization method.
:param fused_filers: a list of filters to be fused.
"""
super().__init__()
- self.fused_filters = fused_filters
+ self.fused_filters = fused_filters
[docs] def compute_stats(self, sample):
# context for the intermediate vars
diff --git a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
index dec801b69..c68e22df8 100644
--- a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
+++ b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
@@ -81,7 +81,7 @@ Source code for data_juicer.ops.selector.frequency_specified_field_selector<
"""Selector to select samples based on the sorted frequency of specified
field."""
- def __init__(self,
+[docs] def __init__(self,
field_key: str = '',
top_ratio: ClosedUnitInterval = None,
topk: PositiveInt = None,
@@ -114,7 +114,7 @@ Source code for data_juicer.ops.selector.frequency_specified_field_selector<
self.field_key = field_key
self.top_ratio = top_ratio
self.topk = topk
- self.reverse = reverse
+ self.reverse = reverse
[docs] def process(self, dataset):
if len(dataset) <= 1 or not self.field_key:
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
index 6ebf81c65..f5302e79a 100644
--- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
+++ b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
@@ -92,7 +92,7 @@ Source code for data_juicer.ops.selector.topk_specified_field_selector
<
"""Selector to select top samples based on the sorted specified field
value."""
- def __init__(self,
+[docs] def __init__(self,
field_key: str = '',
top_ratio: ClosedUnitInterval = None,
topk: PositiveInt = None,
@@ -125,7 +125,7 @@ Source code for data_juicer.ops.selector.topk_specified_field_selector
<
self.field_key = field_key
self.top_ratio = top_ratio
self.topk = topk
- self.reverse = reverse
+ self.reverse = reverse
[docs] def process(self, dataset):
if len(dataset) <= 1 or not self.field_key:
diff --git a/_modules/data_juicer/utils/ckpt_utils.html b/_modules/data_juicer/utils/ckpt_utils.html
index 857d86de7..f8062a475 100644
--- a/_modules/data_juicer/utils/ckpt_utils.html
+++ b/_modules/data_juicer/utils/ckpt_utils.html
@@ -86,7 +86,7 @@ Source code for data_juicer.utils.ckpt_utils
rerun from the beginning.
"""
- def __init__(self, ckpt_dir, original_process_list, num_proc=1):
+[docs] def __init__(self, ckpt_dir, original_process_list, num_proc=1):
"""
Initialization method.
@@ -101,7 +101,7 @@ Source code for data_juicer.utils.ckpt_utils
self.num_proc = num_proc
self.op_record = []
- self.ckpt_available = self.check_ckpt()
+ self.ckpt_available = self.check_ckpt()
[docs] def get_left_process_list(self):
"""
diff --git a/_modules/data_juicer/utils/compress.html b/_modules/data_juicer/utils/compress.html
index fe83f1afa..70bfaf5d7 100644
--- a/_modules/data_juicer/utils/compress.html
+++ b/_modules/data_juicer/utils/compress.html
@@ -250,7 +250,7 @@ Source code for data_juicer.utils.compress
using compression format algorithms.
"""
- def __init__(self, compressor_format: str = 'zstd'):
+[docs] def __init__(self, compressor_format: str = 'zstd'):
"""
Initialization method.
@@ -261,7 +261,7 @@ Source code for data_juicer.utils.compress
assert compressor_format in Compressor.compressors.keys()
self.compressor_format = compressor_format
self.compressor = Compressor
- self.extractor = Extractor
+ self.extractor = Extractor
[docs] def compress(
self,
@@ -295,7 +295,7 @@ Source code for data_juicer.utils.compress
using compression format algorithms.
"""
- def __init__(self, compressor_format: str = 'zstd'):
+[docs] def __init__(self, compressor_format: str = 'zstd'):
"""
Initialization method.
@@ -306,7 +306,7 @@ Source code for data_juicer.utils.compress
self.compressor_extension = '.' + compressor_format
self.compress_manager = CompressManager(
compressor_format=compressor_format)
- self.pattern = re.compile('_\d{5}_of_') # noqa W605
+ self.pattern = re.compile('_\d{5}_of_') # noqa W605
def _get_raw_filename(self, filename: Union[Path, str]):
"""
diff --git a/_modules/data_juicer/utils/fingerprint_utils.html b/_modules/data_juicer/utils/fingerprint_utils.html
index 590626228..19b99ed8f 100644
--- a/_modules/data_juicer/utils/fingerprint_utils.html
+++ b/_modules/data_juicer/utils/fingerprint_utils.html
@@ -86,8 +86,8 @@ Source code for data_juicer.utils.fingerprint_utils
dispatch: Dict = {}
- def __init__(self):
- self.m = xxhash.xxh64()
+
[docs] @classmethod
def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
diff --git a/_modules/data_juicer/utils/logger_utils.html b/_modules/data_juicer/utils/logger_utils.html
index c034586b7..cf2e76dfe 100644
--- a/_modules/data_juicer/utils/logger_utils.html
+++ b/_modules/data_juicer/utils/logger_utils.html
@@ -114,7 +114,7 @@ Source code for data_juicer.utils.logger_utils
[docs]class StreamToLoguru:
"""Stream object that redirects writes to a logger instance."""
- def __init__(self, level='INFO', caller_names=('datasets', 'logging')):
+
[docs] def __init__(self, level='INFO', caller_names=('datasets', 'logging')):
"""
Initialization method.
@@ -124,7 +124,7 @@ Source code for data_juicer.utils.logger_utils
"""
self.level = level
self.linebuf = ''
- self.caller_names = caller_names
+ self.caller_names = caller_names
[docs] def write(self, buf):
full_name = get_caller_name(depth=1)
diff --git a/_modules/data_juicer/utils/registry.html b/_modules/data_juicer/utils/registry.html
index 134abaa41..d07b77ab0 100644
--- a/_modules/data_juicer/utils/registry.html
+++ b/_modules/data_juicer/utils/registry.html
@@ -95,14 +95,14 @@ Source code for data_juicer.utils.registry
"""This class is used to register some modules to registry by a repo
name."""
- def __init__(self, name: str):
+[docs] def __init__(self, name: str):
"""
Initialization method.
:param name: a registry repo name
"""
self._name = name
- self._modules = {}
+ self._modules = {}
@property
def name(self):
diff --git a/data_juicer.analysis.html b/data_juicer.analysis.html
index 6c08d1363..672474713 100644
--- a/data_juicer.analysis.html
+++ b/data_juicer.analysis.html
@@ -108,6 +108,19 @@ d a t a _ j u i c e r . a n a l y s i sclass data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]¶
Bases: object
Apply analysis on each column of stats respectively.
+
+-
+__init__(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]¶
+Initialization method
+:param dataset: the dataset to be analysed
+:param output_path: path to store the analysis results
+:param overall_result: optional precomputed overall stats result
+:param save_stats_in_one_file: whether to save all analysis figures of all
+
+stats into one image file
+
+
+
-
analyse(show_percentiles=False, show=False)[source]¶
@@ -201,6 +214,15 @@ d a t a _ j u i c e r . a n a l y s i sobject
Apply diversity analysis for each sample and get an overall analysis
result.
+
+-
+__init__(dataset, output_path, lang_or_model='en')[source]¶
+Initialization method.
+:param dataset: the dataset to be analysed
+:param output_path: path to store the analysis results
+:param lang_or_model: the diversity model or a specific language used to load the diversity model.
+
+
-
analyse(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]¶
@@ -307,6 +329,20 @@ d a t a _ j u i c e r . a n a l y s i sobject
Apply analysis on the overall stats, including mean, std, quantiles,
etc.
+
+-
+__init__(dataset, output_path)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset – the dataset to be analysed
+output_path – path to store the analysis results.
+
+
+
+
+
-
analyse(percentiles=[])[source]¶
diff --git a/data_juicer.core.html b/data_juicer.core.html
index 0b560b21e..3f3c1c769 100644
--- a/data_juicer.core.html
+++ b/data_juicer.core.html
@@ -126,6 +126,17 @@ d a t a _ j u i c e r . c o r e
+-
+__init__(cfg=None)[source]¶
+Initialization method.
+
+- Parameters:
+cfg – optional config dict.
+
+
+
+
-
run(load_data_np=None)[source]¶
@@ -150,6 +161,11 @@ d a t a _ j u i c e r . c o r eclass data_juicer.core.data.NestedDataset(*args, **kargs)[source]¶
Bases: Dataset
Enhanced HuggingFace-Dataset for better usability and efficiency.
+
+
-
add_column(*args, **kargs)[source]¶
@@ -214,6 +230,11 @@ d a t a _ j u i c e r . c o r eclass data_juicer.core.data.NestedDatasetDict(*args, **kargs)[source]¶
Bases: DatasetDict
Enhanced HuggingFace-DatasetDict for better usability and efficiency.
+
+
-
map(**args)[source]¶
@@ -228,6 +249,11 @@ d a t a _ j u i c e r . c o r eclass data_juicer.core.data.NestedQueryDict(*args, **kargs)[source]¶
Bases: dict
Enhanced dict for better usability.
+
+
@@ -288,6 +314,17 @@ d a t a _ j u i c e r . c o r e
+-
+__init__(cfg=None)[source]¶
+Initialization method.
+
+- Parameters:
+cfg – optional config dict.
+
+
+
+
-
run(load_data_np=None)[source]¶
@@ -333,6 +370,25 @@ d a t a _ j u i c e r . c o r eTiB = 1099511627776¶
+
+-
+__init__(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, export_stats=True)[source]¶
+Initialization method.
+
+- Parameters:
+
+export_path – the path to export datasets.
+export_shard_size – the size of each shard of exported
+dataset. By default, it’s 0, which means export the dataset
+to a single file.
+num_proc – number of processes used to export the dataset.
+export_ds – whether to export the dataset contents.
+export_stats – whether to export the stats of dataset.
+
+
+
+
+
-
export(dataset)[source]¶
@@ -402,6 +458,17 @@ d a t a _ j u i c e r . c o r e
+-
+__init__(cfg=None)[source]¶
+Initialization method.
+
+- Parameters:
+cfg – optional config dict.
+
+
+
+
-
run(load_data_np=None)[source]¶
@@ -428,6 +495,22 @@ d a t a _ j u i c e r . c o r e
+-
+__init__(work_dir, show_num=10)[source]¶
+Initialization method.
+
+- Parameters:
+
+work_dir – the work directory to store the comparison
+results
+show_num – the maximum number of samples to show in the
+comparison result files.
+
+
+
+
+
-
trace_batch_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]¶
diff --git a/data_juicer.format.html b/data_juicer.format.html
index 37f558120..32d3cbffb 100644
--- a/data_juicer.format.html
+++ b/data_juicer.format.html
@@ -137,6 +137,21 @@ d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.csv']¶
+
+
@@ -160,6 +175,27 @@ d a t a _ j u i c e r . f o r m a tBaseFormatter
The class is used to load a dataset from local files or local
directory.
+
+-
+__init__(dataset_path: str, type: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset_path – path to a dataset file or a dataset
+directory
+type – a packaged dataset module type (json, csv, etc.)
+suffixes – files with specified suffixes to be processed
+text_keys – key names of field that stores sample
+text.
+add_suffix – whether to add the file suffix to dataset
+meta info
+kwargs – extra args
+
+
+
+
+
-
load_dataset(num_proc: int = 1) Dataset [source]¶
@@ -186,6 +222,22 @@ d a t a _ j u i c e r . f o r m a tBaseFormatter
The class is used to load a dataset from repository of huggingface
hub.
+
+-
+__init__(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset_path – a dataset file or a dataset directory
+text_keys – key names of field that stores sample
+text.
+kwargs – extra args
+
+
+
+
+
-
load_dataset(num_proc: int = 1) Dataset [source]¶
@@ -277,6 +329,21 @@ d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.json', '.jsonl', '.jsonl.zst']¶
+
+
@@ -314,6 +381,26 @@ d a t a _ j u i c e r . f o r m a t
+-
+__init__(dataset_path: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys=None, add_suffix=False, **kwargs)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset_path – a dataset file or a dataset dir or a list
+of them, optional weights, default 1.0 e.g. <w1> ds.jsonl
+<w2> ds_dir <w3> ds_file.json
+suffixes – files with specified suffixes to be processed
+text_keys – key names of field that stores sample text.
+add_suffix – whether to add the file suffix to dataset
+meta info
+kwargs – extra args
+
+
+
+
+
-
load_dataset(num_proc: int = 1) Dataset [source]¶
@@ -344,6 +431,21 @@ d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.parquet']¶
+
+
@@ -360,6 +462,23 @@ d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']¶
+
+-
+__init__(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset_path – a dataset file or a dataset directory
+suffixes – files with specified suffixes to be processed
+add_suffix – Whether to add file suffix to dataset meta
+info
+kwargs – extra args
+
+
+
+
+
-
load_dataset(num_proc: int = 1) Dataset [source]¶
@@ -418,6 +537,21 @@ d a t a _ j u i c e r . f o r m a tSUFFIXES = ['.tsv']¶
+
+-
+__init__(dataset_path, suffixes=None, **kwargs)[source]¶
+Initialization method.
+
+- Parameters:
+
+dataset_path – a dataset file or a dataset directory
+suffixes – files with specified suffixes to be processed
+kwargs – extra args, e.g. delimiter = ‘,’
+
+
+
+
+
diff --git a/data_juicer.html b/data_juicer.html
index 43f8860d6..9ba6948e0 100644
--- a/data_juicer.html
+++ b/data_juicer.html
@@ -87,6 +87,7 @@ d a t a _ j u i c e rd a t a _ j u i c e r . a n a l y s i s
- data_juicer.analysis.column_wise_analysis
ColumnWiseAnalysis
+ColumnWiseAnalysis.__init__()
ColumnWiseAnalysis.analyse()
ColumnWiseAnalysis.draw_box()
ColumnWiseAnalysis.draw_hist()
@@ -97,6 +98,7 @@ d a t a _ j u i c e r
- data_juicer.analysis.diversity_analysis
DiversityAnalysis
@@ -108,6 +110,7 @@ d a t a _ j u i c e r
- data_juicer.analysis.overall_analysis
OverallAnalysis
@@ -129,6 +132,7 @@ d a t a _ j u i c e rd a t a _ j u i c e r . c o r e
- data_juicer.core.analyser
Analyser
@@ -136,6 +140,7 @@ d a t a _ j u i c e r
- data_juicer.core.data
NestedDataset
+NestedDataset.__init__()
NestedDataset.add_column()
NestedDataset.cleanup_cache_files()
NestedDataset.filter()
@@ -147,10 +152,14 @@ d a t a _ j u i c e r
NestedDatasetDict
-NestedQueryDict
+NestedQueryDict
+
nested_obj_factory()
nested_query()
wrap_func_with_nested_access()
@@ -158,6 +167,7 @@ d a t a _ j u i c e r
- data_juicer.core.executor
Executor
@@ -169,6 +179,7 @@ d a t a _ j u i c e rExporter.KiB
Exporter.MiB
Exporter.TiB
+Exporter.__init__()
Exporter.export()
Exporter.to_jsonl()
Exporter.to_parquet()
@@ -178,6 +189,7 @@ d a t a _ j u i c e r
- data_juicer.core.ray_executor
RayExecutor
@@ -185,6 +197,7 @@ d a t a _ j u i c e r
- data_juicer.core.tracer
Tracer
+Tracer.__init__()
Tracer.trace_batch_mapper()
Tracer.trace_deduplicator()
Tracer.trace_filter()
@@ -199,6 +212,7 @@ d a t a _ j u i c e rdata_juicer.format.csv_formatter
@@ -209,10 +223,12 @@ d a t a _ j u i c e r
LocalFormatter
RemoteFormatter
@@ -224,6 +240,7 @@ d a t a _ j u i c e rdata_juicer.format.json_formatter
@@ -234,6 +251,7 @@ d a t a _ j u i c e r
- data_juicer.format.mixture_formatter
MixtureFormatter
@@ -242,6 +260,7 @@ d a t a _ j u i c e rdata_juicer.format.parquet_formatter
@@ -249,6 +268,7 @@ d a t a _ j u i c e rdata_juicer.format.text_formatter
TextFormatter
@@ -259,6 +279,7 @@ d a t a _ j u i c e rdata_juicer.format.tsv_formatter
@@ -462,21 +483,25 @@ d a t a _ j u i c e r
- data_juicer.ops.base_op
Deduplicator
Filter
Mapper
Selector
@@ -488,6 +513,7 @@ d a t a _ j u i c e r
- data_juicer.ops.op_fusion
FusedFilter
@@ -507,6 +533,7 @@ d a t a _ j u i c e rdata_juicer.utils.cache_utils
- data_juicer.utils.ckpt_utils
CheckpointManager
+CheckpointManager.__init__()
CheckpointManager.check_ckpt()
CheckpointManager.check_ops_to_skip()
CheckpointManager.get_left_process_list()
@@ -523,6 +550,7 @@ d a t a _ j u i c e r
CacheCompressManager
+CacheCompressManager.__init__()
CacheCompressManager.cleanup_cache_files()
CacheCompressManager.compress()
CacheCompressManager.decompress()
@@ -530,6 +558,7 @@ d a t a _ j u i c e r
CompressManager
@@ -609,6 +638,7 @@ d a t a _ j u i c e r
- data_juicer.utils.fingerprint_utils
Hasher
+Hasher.__init__()
Hasher.dispatch
Hasher.hash()
Hasher.hash_bytes()
@@ -624,6 +654,7 @@ d a t a _ j u i c e rdata_juicer.utils.logger_utils