From 18400d569267a7a4b723811eb03a7050660063c3 Mon Sep 17 00:00:00 2001 From: yxdyc Date: Thu, 26 Dec 2024 06:50:59 +0000 Subject: [PATCH] deploy: 7d5f37d6f7d5c41d135c7ff28ca5330c85cbbfec --- .buildinfo | 4 +- _modules/data_juicer.html | 22 +- _modules/data_juicer/analysis/collector.html | 188 - .../analysis/column_wise_analysis.html | 39 +- .../analysis/diversity_analysis.html | 45 +- _modules/data_juicer/analysis/draw.html | 154 - _modules/data_juicer/analysis/measure.html | 372 -- .../analysis/overall_analysis.html | 30 +- _modules/data_juicer/config/config.html | 77 +- _modules/data_juicer/core/adapter.html | 40 +- _modules/data_juicer/core/analyzer.html | 28 +- _modules/data_juicer/core/data.html | 99 +- _modules/data_juicer/core/executor.html | 30 +- _modules/data_juicer/core/exporter.html | 36 +- _modules/data_juicer/core/monitor.html | 43 +- _modules/data_juicer/core/ray_data.html | 411 -- _modules/data_juicer/core/ray_executor.html | 210 - _modules/data_juicer/core/tracer.html | 34 +- .../data_juicer/format/csv_formatter.html | 26 +- .../data_juicer/format/empty_formatter.html | 34 +- _modules/data_juicer/format/formatter.html | 57 +- .../data_juicer/format/json_formatter.html | 26 +- _modules/data_juicer/format/load.html | 24 +- .../data_juicer/format/mixture_formatter.html | 30 +- .../data_juicer/format/parquet_formatter.html | 26 +- .../data_juicer/format/text_formatter.html | 38 +- .../data_juicer/format/tsv_formatter.html | 26 +- .../entity_attribute_aggregator.html | 32 +- .../most_relavant_entities_aggregator.html | 32 +- .../ops/aggregator/nested_aggregator.html | 32 +- _modules/data_juicer/ops/base_op.html | 153 +- .../data_juicer/ops/common/helper_func.html | 58 +- .../deduplicator/document_deduplicator.html | 30 +- .../document_minhash_deduplicator.html | 40 +- .../document_simhash_deduplicator.html | 30 +- .../ops/deduplicator/image_deduplicator.html | 35 +- .../deduplicator/ray_basic_deduplicator.html | 32 +- .../ray_document_deduplicator.html | 28 +- .../deduplicator/ray_image_deduplicator.html | 33 +- .../deduplicator/ray_video_deduplicator.html | 28 +- .../ops/deduplicator/video_deduplicator.html | 30 +- .../ops/filter/alphanumeric_filter.html | 30 +- .../ops/filter/audio_duration_filter.html | 30 +- .../ops/filter/audio_nmf_snr_filter.html | 40 +- .../ops/filter/audio_size_filter.html | 30 +- .../filter/average_line_length_filter.html | 30 +- .../filter/character_repetition_filter.html | 30 +- .../ops/filter/flagged_words_filter.html | 30 +- .../ops/filter/image_aesthetics_filter.html | 30 +- .../ops/filter/image_aspect_ratio_filter.html | 30 +- .../ops/filter/image_face_count_filter.html | 30 +- .../ops/filter/image_face_ratio_filter.html | 30 +- .../ops/filter/image_nsfw_filter.html | 30 +- .../filter/image_pair_similarity_filter.html | 30 +- .../ops/filter/image_shape_filter.html | 30 +- .../ops/filter/image_size_filter.html | 30 +- .../filter/image_text_matching_filter.html | 30 +- .../filter/image_text_similarity_filter.html | 30 +- .../ops/filter/image_watermark_filter.html | 30 +- .../ops/filter/language_id_score_filter.html | 30 +- .../filter/maximum_line_length_filter.html | 30 +- .../ops/filter/perplexity_filter.html | 30 +- .../phrase_grounding_recall_filter.html | 45 +- .../ops/filter/special_characters_filter.html | 30 +- .../ops/filter/specified_field_filter.html | 30 +- .../specified_numeric_field_filter.html | 35 +- .../ops/filter/stopwords_filter.html | 30 +- .../data_juicer/ops/filter/suffix_filter.html | 30 +- .../ops/filter/text_action_filter.html | 30 +- .../filter/text_entity_dependency_filter.html | 30 +- .../ops/filter/text_length_filter.html | 30 +- .../ops/filter/token_num_filter.html | 30 +- .../ops/filter/video_aesthetics_filter.html | 30 +- .../ops/filter/video_aspect_ratio_filter.html | 30 +- .../ops/filter/video_duration_filter.html | 30 +- .../video_frames_text_similarity_filter.html | 30 +- .../ops/filter/video_motion_score_filter.html | 39 +- .../video_motion_score_raft_filter.html | 30 +- .../ops/filter/video_nsfw_filter.html | 30 +- .../filter/video_ocr_area_ratio_filter.html | 37 +- .../ops/filter/video_resolution_filter.html | 30 +- .../video_tagging_from_frames_filter.html | 30 +- .../ops/filter/video_watermark_filter.html | 30 +- .../ops/filter/word_repetition_filter.html | 30 +- .../ops/filter/words_num_filter.html | 30 +- .../ops/grouper/key_value_grouper.html | 28 +- .../ops/grouper/naive_grouper.html | 28 +- _modules/data_juicer/ops/load.html | 24 +- .../mapper/audio_ffmpeg_wrapped_mapper.html | 28 +- .../ops/mapper/calibrate_qa_mapper.html | 32 +- .../ops/mapper/calibrate_query_mapper.html | 26 +- .../ops/mapper/calibrate_response_mapper.html | 26 +- .../ops/mapper/chinese_convert_mapper.html | 33 +- .../ops/mapper/clean_copyright_mapper.html | 28 +- .../ops/mapper/clean_email_mapper.html | 28 +- .../ops/mapper/clean_html_mapper.html | 28 +- .../ops/mapper/clean_ip_mapper.html | 28 +- .../ops/mapper/clean_links_mapper.html | 28 +- .../ops/mapper/expand_macro_mapper.html | 28 +- .../extract_entity_attribute_mapper.html | 30 +- .../extract_entity_relation_mapper.html | 34 +- .../ops/mapper/extract_event_mapper.html | 30 +- .../ops/mapper/extract_keyword_mapper.html | 30 +- .../ops/mapper/extract_nickname_mapper.html | 30 +- .../mapper/extract_support_text_mapper.html | 28 +- .../ops/mapper/fix_unicode_mapper.html | 28 +- .../generate_qa_from_examples_mapper.html | 32 +- .../mapper/generate_qa_from_text_mapper.html | 30 +- .../ops/mapper/image_blur_mapper.html | 28 +- .../image_captioning_from_gpt4v_mapper.html | 33 +- .../ops/mapper/image_captioning_mapper.html | 28 +- .../ops/mapper/image_diffusion_mapper.html | 28 +- .../ops/mapper/image_face_blur_mapper.html | 28 +- .../ops/mapper/image_tagging_mapper.html | 28 +- .../ops/mapper/nlpaug_en_mapper.html | 28 +- .../ops/mapper/nlpcda_zh_mapper.html | 28 +- .../ops/mapper/optimize_qa_mapper.html | 32 +- .../ops/mapper/optimize_query_mapper.html | 26 +- .../ops/mapper/optimize_response_mapper.html | 26 +- .../ops/mapper/pair_preference_mapper.html | 32 +- .../punctuation_normalization_mapper.html | 28 +- .../ops/mapper/python_file_mapper.html | 30 +- .../ops/mapper/python_lambda_mapper.html | 30 +- .../ops/mapper/relation_identity_mapper.html | 30 +- .../mapper/remove_bibliography_mapper.html | 28 +- .../ops/mapper/remove_comments_mapper.html | 28 +- .../ops/mapper/remove_header_mapper.html | 28 +- .../ops/mapper/remove_long_words_mapper.html | 30 +- .../remove_non_chinese_character_mapper.html | 28 +- .../remove_repeat_sentences_mapper.html | 33 +- .../mapper/remove_specific_chars_mapper.html | 28 +- .../ops/mapper/remove_table_text_mapper.html | 28 +- ...ords_with_incorrect_substrings_mapper.html | 30 +- .../ops/mapper/replace_content_mapper.html | 28 +- .../ops/mapper/sentence_split_mapper.html | 28 +- .../ops/mapper/text_chunk_mapper.html | 32 +- .../video_captioning_from_audio_mapper.html | 28 +- .../video_captioning_from_frames_mapper.html | 28 +- ...deo_captioning_from_summarizer_mapper.html | 28 +- .../video_captioning_from_video_mapper.html | 28 +- .../mapper/video_extract_frames_mapper.html | 28 +- .../ops/mapper/video_face_blur_mapper.html | 28 +- .../mapper/video_ffmpeg_wrapped_mapper.html | 28 +- .../mapper/video_remove_watermark_mapper.html | 28 +- .../video_resize_aspect_ratio_mapper.html | 33 +- .../video_resize_resolution_mapper.html | 28 +- .../video_split_by_duration_mapper.html | 35 +- .../video_split_by_key_frame_mapper.html | 35 +- .../mapper/video_split_by_scene_mapper.html | 33 +- .../video_tagging_from_audio_mapper.html | 28 +- .../video_tagging_from_frames_mapper.html | 28 +- .../whitespace_normalization_mapper.html | 28 +- _modules/data_juicer/ops/op_fusion.html | 332 -- .../frequency_specified_field_selector.html | 28 +- .../ops/selector/random_selector.html | 28 +- .../range_specified_field_selector.html | 28 +- .../topk_specified_field_selector.html | 28 +- _modules/data_juicer/utils/asset_utils.html | 170 - .../data_juicer/utils/auto_install_utils.html | 221 - _modules/data_juicer/utils/cache_utils.html | 188 - _modules/data_juicer/utils/ckpt_utils.html | 270 - _modules/data_juicer/utils/common_utils.html | 277 - _modules/data_juicer/utils/compress.html | 690 --- _modules/data_juicer/utils/constant.html | 399 -- _modules/data_juicer/utils/file_utils.html | 345 -- .../data_juicer/utils/fingerprint_utils.html | 281 - _modules/data_juicer/utils/lazy_loader.html | 184 - _modules/data_juicer/utils/logger_utils.html | 307 -- _modules/data_juicer/utils/mm_utils.html | 1165 ---- _modules/data_juicer/utils/model_utils.html | 982 ---- _modules/data_juicer/utils/process_utils.html | 237 - _modules/data_juicer/utils/registry.html | 258 - .../data_juicer/utils/resource_utils.html | 173 - .../data_juicer/utils/unittest_utils.html | 263 - _modules/index.html | 47 +- _sources/data_juicer.analysis.rst.txt | 60 +- _sources/data_juicer.config.rst.txt | 20 +- _sources/data_juicer.core.rst.txt | 84 +- _sources/data_juicer.format.rst.txt | 84 +- _sources/data_juicer.ops.aggregator.rst.txt | 36 +- _sources/data_juicer.ops.common.rst.txt | 28 +- _sources/data_juicer.ops.deduplicator.rst.txt | 84 +- _sources/data_juicer.ops.filter.rst.txt | 364 +- _sources/data_juicer.ops.grouper.rst.txt | 28 +- _sources/data_juicer.ops.mapper.rst.txt | 524 +- _sources/data_juicer.ops.rst.txt | 50 +- _sources/data_juicer.ops.selector.rst.txt | 44 +- _sources/data_juicer.rst.txt | 23 +- _sources/data_juicer.tools.rst.txt | 9 +- _sources/data_juicer.utils.rst.txt | 164 +- _static/basic.css | 15 +- _static/doctools.js | 7 - _static/language_data.js | 7 - _static/searchtools.js | 38 +- data_juicer.analysis.html | 523 +- data_juicer.config.html | 216 +- data_juicer.core.html | 1012 +--- data_juicer.format.html | 616 +-- data_juicer.html | 1208 +--- data_juicer.ops.aggregator.html | 330 +- data_juicer.ops.common.html | 247 +- data_juicer.ops.deduplicator.html | 609 +- data_juicer.ops.filter.html | 3044 +--------- data_juicer.ops.grouper.html | 139 +- data_juicer.ops.html | 2675 +-------- data_juicer.ops.mapper.html | 3666 +----------- data_juicer.ops.selector.html | 280 +- data_juicer.tools.html | 33 +- data_juicer.utils.html | 2331 +------- genindex.html | 4891 ++--------------- index.html | 381 +- modules.html | 116 +- objects.inv | Bin 16752 -> 7512 bytes py-modindex.html | 905 +-- search.html | 22 +- searchindex.js | 2 +- 216 files changed, 3578 insertions(+), 34295 deletions(-) delete mode 100644 _modules/data_juicer/analysis/collector.html delete mode 100644 _modules/data_juicer/analysis/draw.html delete mode 100644 _modules/data_juicer/analysis/measure.html delete mode 100644 _modules/data_juicer/core/ray_data.html delete mode 100644 _modules/data_juicer/core/ray_executor.html delete mode 100644 _modules/data_juicer/ops/op_fusion.html delete mode 100644 _modules/data_juicer/utils/asset_utils.html delete mode 100644 _modules/data_juicer/utils/auto_install_utils.html delete mode 100644 _modules/data_juicer/utils/cache_utils.html delete mode 100644 _modules/data_juicer/utils/ckpt_utils.html delete mode 100644 _modules/data_juicer/utils/common_utils.html delete mode 100644 _modules/data_juicer/utils/compress.html delete mode 100644 _modules/data_juicer/utils/constant.html delete mode 100644 _modules/data_juicer/utils/file_utils.html delete mode 100644 _modules/data_juicer/utils/fingerprint_utils.html delete mode 100644 _modules/data_juicer/utils/lazy_loader.html delete mode 100644 _modules/data_juicer/utils/logger_utils.html delete mode 100644 _modules/data_juicer/utils/mm_utils.html delete mode 100644 _modules/data_juicer/utils/model_utils.html delete mode 100644 _modules/data_juicer/utils/process_utils.html delete mode 100644 _modules/data_juicer/utils/registry.html delete mode 100644 _modules/data_juicer/utils/resource_utils.html delete mode 100644 _modules/data_juicer/utils/unittest_utils.html diff --git a/.buildinfo b/.buildinfo index 3940ce529..7a3a9d48e 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 -# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 02acd820f6eb43d6f533ae13ad9142b0 +# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. +config: d21389c0a148f57cab87e3135f4aa3e2 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_modules/data_juicer.html b/_modules/data_juicer.html index 9ddcf12c4..1fc60dc18 100644 --- a/_modules/data_juicer.html +++ b/_modules/data_juicer.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ diff --git a/_modules/data_juicer/analysis/collector.html b/_modules/data_juicer/analysis/collector.html deleted file mode 100644 index f05c53043..000000000 --- a/_modules/data_juicer/analysis/collector.html +++ /dev/null @@ -1,188 +0,0 @@ - - - - - - - - data_juicer.analysis.collector — data_juicer 1.0.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.collector

-from itertools import chain
-
-from data_juicer.format import load_formatter
-from data_juicer.utils.lazy_loader import LazyLoader
-
-torch = LazyLoader('torch', 'torch')
-transformers = LazyLoader('transformers', 'transformers')
-
-
-
-[docs] -class TextTokenDistCollector(object): - """Tokenize and collect distribution of tokens for given - dataset with a specified tokenizer. - """ - -
-[docs] - def __init__(self, tokenizer): - """ - Initialization method. - - :param tokenizer: tokenizer name on huggingface - """ - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer, trust_remote_code=True) - self.vocab_size = len(self.tokenizer)
- - -
-[docs] - def collect(self, - data_path, - text_key, - num_proc=1) -> 'torch.distributions.Categorical': - """ - Tokenize and collect tokens distribution of input dataset - :param data_path: path to input dataset. - :param text_key: field keys that will be considered into token counts. - :param num_proc: number of processes to count tokens. - :return: token distribution. - """ - - formatter = load_formatter(data_path) - dataset = formatter.load_dataset(num_proc=num_proc) - assert text_key in dataset.features, f'[{text_key} not find in dataset' - - def prepare_tokenizer( - tokenizer, - text_key, - ): - """ - Prepare a tokenizer function for dataset. - :param tokenizer: a tokenizer to tokenize sample. - :param text_key: field keys that will be - considered into token counts. - """ - - def _tokenize_fn(example, ): - example = tokenizer(example[text_key], - add_special_tokens=False) - return example - - return _tokenize_fn - - tokenize_proc = prepare_tokenizer(self.tokenizer, text_key) - dataset = dataset.map(tokenize_proc, - num_proc=num_proc, - desc=f'tokenize {data_path.split("/")[-1]}') - - token_count = torch.zeros(self.vocab_size, dtype=torch.int64) - token_ids = torch.tensor( - list(chain.from_iterable(dataset['input_ids']))) - indices, counts = token_ids.unique(return_counts=True) - token_count.scatter_(0, indices, counts.to(token_count.dtype)) - dist = torch.distributions.Categorical(token_count) - return dist
-
- -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html index 88f2ba97f..00680847c 100644 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ b/_modules/data_juicer/analysis/column_wise_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ @@ -90,8 +90,6 @@

Source code for data_juicer.analysis.column_wise_analysis

from .overall_analysis import OverallAnalysis -
-[docs] def get_row_col(total_num, factor=2): """ Given the total number of stats figures, get the "best" number of rows and @@ -130,17 +128,16 @@

Source code for data_juicer.analysis.column_wise_analysis

for i in range(total_num): grids.append((i // now_col, i % now_col)) - return int(now_row), int(now_col), grids
- + return int(now_row), int(now_col), grids
-[docs] +[docs] class ColumnWiseAnalysis: """Apply analysis on each column of stats respectively."""
-[docs] +[docs] def __init__(self, dataset, output_path, @@ -176,7 +173,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def analyze(self, show_percentiles=False, show=False, skip_export=False): """ Apply analysis and draw the analysis figure for stats. @@ -294,7 +291,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False): """ Draw the histogram for the data. @@ -355,7 +352,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False): """ Draw the box plot for the data. @@ -406,7 +403,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_wordcloud(self, ax, data, save_path, show=False): word_list = data.tolist() word_nums = {} diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html index 104fa13dd..6556ae401 100644 --- a/_modules/data_juicer/analysis/diversity_analysis.html +++ b/_modules/data_juicer/analysis/diversity_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -88,8 +88,6 @@

Source code for data_juicer.analysis.diversity_analysis

# Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb -
-[docs] def find_root_verb_and_its_dobj(tree_root): """ Find the verb and its object closest to the root. @@ -110,14 +108,11 @@

Source code for data_juicer.analysis.diversity_analysis

for child in tree_root.children: return find_root_verb_and_its_dobj(child) # if no children satisfy the condition, return None - return None, None
- + return None, None # Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb -
-[docs] def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True): """ Find the verb and its object closest to the root of lexical tree of input @@ -136,12 +131,9 @@

Source code for data_juicer.analysis.diversity_analysis

verb, noun = find_root_verb_and_its_dobj(sent.root) if first_sent or (verb is not None and noun is not None): return verb, noun - return None, None
- + return None, None -
-[docs] def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs): """ Given the lexical tree analysis result, return the diversity results. @@ -166,18 +158,17 @@

Source code for data_juicer.analysis.diversity_analysis

df = df.groupby('verb').apply(lambda x: x.sort_values( 'count', ascending=False).head(top_k_nouns)).reset_index(drop=True) - return df
- + return df
-[docs] +[docs] class DiversityAnalysis: """Apply diversity analysis for each sample and get an overall analysis result."""
-[docs] +[docs] def __init__(self, dataset, output_path, lang_or_model='en'): """Initialization method :param dataset: the dataset to be analyzed :param output_path: path to store the analysis results :param @@ -192,7 +183,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def compute(self, lang_or_model=None, column_name='text'): """ Apply lexical tree analysis on each sample. @@ -226,7 +217,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def analyze(self, lang_or_model=None, column_name='text', diff --git a/_modules/data_juicer/analysis/draw.html b/_modules/data_juicer/analysis/draw.html deleted file mode 100644 index 5a7cc7a3e..000000000 --- a/_modules/data_juicer/analysis/draw.html +++ /dev/null @@ -1,154 +0,0 @@ - - - - - - - - data_juicer.analysis.draw — data_juicer 1.0.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.draw

-import matplotlib.pyplot as plt
-import numpy as np
-import seaborn as sns
-
-
-
-[docs] -def draw_heatmap(data, xlabels, ylables=None, figsize=None, triangle=False): - """ - Draw heatmap of input data with special lables. - - :param data: input data, now support - [`list`, `tuple`, `numpy array`, 'torch tensor'] - :param xlabels: x axis labels. - :param ylabels: y axis labels, if None, use xlabels. - :param figsize: figure size. - :param triangle: only display triangle. - :return: a plot figure. - """ - figsize = figsize if figsize else (8 * 2.5, 6 * 2.5) - _, ax = plt.subplots(figsize=figsize) - mask = None - if triangle: - mask = np.triu(np.ones_like(data)) - ax.tick_params( - right=True, - top=True, - labelright=True, - labeltop=True, - ) - sns.heatmap(data, - ax=ax, - cmap='Oranges', - annot=True, - mask=mask, - linewidths=.05, - square=True, - xticklabels=xlabels, - yticklabels=ylables, - annot_kws={'size': 8}) - plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95) - fig = plt.gcf() - plt.show() - return fig
- -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/measure.html b/_modules/data_juicer/analysis/measure.html deleted file mode 100644 index 61d48df0f..000000000 --- a/_modules/data_juicer/analysis/measure.html +++ /dev/null @@ -1,372 +0,0 @@ - - - - - - - - data_juicer.analysis.measure — data_juicer 1.0.2 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.analysis.measure

-import numpy as np
-
-from data_juicer.utils.lazy_loader import LazyLoader
-
-torch = LazyLoader('torch', 'torch')
-td = LazyLoader('td', 'torch.distributions')
-F = LazyLoader('F', 'torch.nn.functional')
-
-stats = LazyLoader('stats', 'scipy.stats')
-
-
-
-[docs] -class Measure(object): - """Base class for Measure distribution. - """ - name = 'base' - -
-[docs] - def measure(self, *args, **kwargs): - pass
- - - def __call__(self, *args, **kwargs): - return self.measure(*args, **kwargs) - - def _convert_to_tensor(self, p): - """ - Convert input data to torch tensor. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch tensor - """ - if isinstance(p, torch.Tensor): - return p - elif isinstance(p, td.Categorical): - return p.probs - elif isinstance(p, str): - return torch.load(p) - else: - return torch.tensor(p) - - def _convert_to_categorical(self, p): - """ - Convert input data to torch Categorical. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch Categorical - """ - if isinstance(p, td.Categorical): - return p - elif isinstance(p, torch.Tensor): - return td.Categorical(p) - elif isinstance(p, str): - return td.Categorical(torch.load(p)) - else: - return td.Categorical(torch.tensor(p)) - - def _convert_to_ndarray(self, p): - """ - Convert input data to torch tensor. - :param p: input data, now support - [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. - :return: torch tensor - """ - return self._convert_to_tensor(p).numpy()
- - - -
-[docs] -class KLDivMeasure(Measure): - """ - Measure Kullback-Leibler divergence. - """ - name = 'kl_divergence' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_categorical(p) - q = self._convert_to_categorical(q) - assert p.probs.shape == q.probs.shape, \ - 'The two inputs have different shape:' \ - f'{p.probs.shape} != {q.probs.shape} in {self.name}' - return F.kl_div(q.logits, p.probs, log_target=False, reduction='sum')
-
- - - -
-[docs] -class JSDivMeasure(Measure): - """ - Measure Jensen-Shannon divergence. - """ - name = 'js_divergence' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_tensor(p) - q = self._convert_to_tensor(q) - assert p.shape == q.shape, \ - 'The two inputs have different shape:' \ - f'{p.shape} != {q.shape} in {self.name}' - - m = 0.5 * (p + q) - kl_p = KLDivMeasure()(p, m) - kl_q = KLDivMeasure()(q, m) - js = 0.5 * (kl_p + kl_q) - return js
-
- - - -
-[docs] -class CrossEntropyMeasure(Measure): - """ - Measure Cross-Entropy. - """ - name = 'cross_entropy' - -
-[docs] - def measure(self, p, q): - p = self._convert_to_categorical(p) - q = self._convert_to_categorical(q) - assert p.probs.shape == q.probs.shape, \ - 'The two inputs have different shape: '\ - f'{p.probs.shape} != {q.probs.shape} in {self.name}' - return F.cross_entropy(q.logits, p.probs, reduction='sum')
-
- - - -
-[docs] -class EntropyMeasure(Measure): - """ - Measure Entropy. - """ - name = 'entropy' - -
-[docs] - def measure(self, p): - p = self._convert_to_categorical(p) - return p.entropy()
-
- - - -
-[docs] -class RelatedTTestMeasure(Measure): - """ - Measure T-Test for two related distributions on their histogram of the same - bins. - - Ref: - https://en.wikipedia.org/wiki/Student%27s_t-test - - For continuous features or distributions, the input could be dataset stats - list. - For discrete features or distributions, the input could be the tags or the - categories list. - """ - name = 't-test' - -
-[docs] - @staticmethod - def stats_to_hist(p, q): - p = np.array(p) - q = np.array(q) - - # get common maximum number of data samples, and max/min values - max_data_num = max(len(p), len(q)) - min_val = min(min(p), min(q)) - max_val = max(max(p), max(q)) - - # get a recommended number of bins - rec_bins = max(int(np.sqrt(max_data_num)), 10) - - # get the common bin edges - common_p = np.append(p, [min_val, max_val]) - hist_p, bin_edges = np.histogram(common_p, bins=rec_bins) - # restore the hist of the original p - hist_p[0] -= 1 - hist_p[-1] -= 1 - # get the hist of the original q using the common bin edges - hist_q, _ = np.histogram(q, bins=bin_edges) - return hist_p, hist_q, bin_edges
- - -
-[docs] - @staticmethod - def category_to_hist(p, q): - - def flatten_list(lst): - res = [] - for s in lst: - if isinstance(s, list): - res.extend(flatten_list(s)) - else: - res.append(s) - return res - - # flatten the list - p = flatten_list(p) - q = flatten_list(q) - - # get the common categories - cat_p = set(p) - cat_q = set(q) - cat_common = cat_p.union(cat_q) - - # get category distributions - count_p = {cat: 0 for cat in cat_common} - count_q = {cat: 0 for cat in cat_common} - for cat in p: - count_p[cat] += 1 - for cat in q: - count_q[cat] += 1 - - # only keep distribution values sorted by counts - sorted_cat = list(count_p.items()) - sorted_cat.sort(key=lambda it: it[1], reverse=True) - sorted_cat = [it[0] for it in sorted_cat] - # get the value dist - hist_p = [count_p[cat] for cat in sorted_cat] - hist_q = [count_q[cat] for cat in sorted_cat] - - return hist_p, hist_q, count_p, count_q, sorted_cat
- - -
-[docs] - def measure(self, p, q): - """ - :param p: the first feature or distribution. (stats/tags/categories) - :param q: the second feature or distribution. (stats/tags/categories) - :return: the T-Test results object -- ([ref](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats._result_classes.TtestResult.html#scipy.stats._result_classes.TtestResult)) # noqa: E501 - """ - ele = p[0] - while isinstance(ele, list): - ele = ele[0] - if isinstance(ele, str): - # discrete tags or categories - hist_p, hist_q = self.category_to_hist(p, q)[:2] - else: - # continuous stats - hist_p, hist_q = self.stats_to_hist(p, q)[:2] - - # compute the t-test and pval for hist_p and hist_q - ttest_res = stats.ttest_rel(hist_p, hist_q) - return ttest_res
-
- -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html index e86453abe..b50c3cc6b 100644 --- a/_modules/data_juicer/analysis/overall_analysis.html +++ b/_modules/data_juicer/analysis/overall_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -93,13 +93,13 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] class OverallAnalysis: """Apply analysis on the overall stats, including mean, std, quantiles, etc."""
-[docs] +[docs] def __init__(self, dataset, output_path): """ Initialization method. @@ -129,7 +129,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def refine_single_column(self, col): if col.dtype != 'object': # not an object, return directly @@ -152,7 +152,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def analyze(self, percentiles=[], num_proc=1, skip_export=False): """ Apply overall analysis on the whole dataset based on the describe diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html index 283b02b33..9adf1de29 100644 --- a/_modules/data_juicer/config/config.html +++ b/_modules/data_juicer/config/config.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -103,7 +103,7 @@

Source code for data_juicer.config.config

 
 
 
-[docs] +[docs] def init_configs(args: Optional[List[str]] = None, which_entry: object = None): """ initialize the jsonargparse parser and parse configs from one of: @@ -481,8 +481,6 @@

Source code for data_juicer.config.config

 
 
 
-