diff --git a/.github/workflows/deploy_spinx_docs.yml b/.github/workflows/deploy_sphinx_docs.yml similarity index 52% rename from .github/workflows/deploy_spinx_docs.yml rename to .github/workflows/deploy_sphinx_docs.yml index 27cff9c39..9c8ae89a0 100644 --- a/.github/workflows/deploy_spinx_docs.yml +++ b/.github/workflows/deploy_sphinx_docs.yml @@ -1,9 +1,13 @@ name: Deploy Sphinx documentation to Pages on: - release: - types: [published] - workflow_dispatch: + pull_request: + types: [opened, synchronize] + paths: + - 'docs/sphinx_doc/**/*' + push: + branches: + - main jobs: pages: @@ -19,14 +23,18 @@ jobs: run: | python -m pip install --upgrade pip pip install -v -e .[dev] - - id: deployment - uses: sphinx-notes/pages@v3 + - id: build + name: Build Documentation + run: | + cd docs/sphinx_doc + bash build_doc.sh + - name: Upload Documentation + uses: actions/upload-artifact@v3 with: - documentation_path: ./docs/sphinx_doc/source - python_version: ${{ matrix.python-version }} - publish: false - requirements_path: ./environments/dev_requires.txt + name: SphinxDoc + path: 'docs/sphinx_doc/build/html' - uses: peaceiris/actions-gh-pages@v3 + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ${{ steps.deployment.outputs.artifact }} + publish_dir: 'docs/sphinx_doc/build/html' diff --git a/data_juicer/analysis/__init__.py b/data_juicer/analysis/__init__.py index 78db975a1..e4ae41aa8 100644 --- a/data_juicer/analysis/__init__.py +++ b/data_juicer/analysis/__init__.py @@ -1,2 +1,9 @@ from .column_wise_analysis import ColumnWiseAnalysis +from .diversity_analysis import DiversityAnalysis from .overall_analysis import OverallAnalysis + +__all__ = [ + 'ColumnWiseAnalysis', + 'DiversityAnalysis', + 'OverallAnalysis', +] diff --git a/data_juicer/config/__init__.py b/data_juicer/config/__init__.py index 853722dec..b33c6e755 100644 --- a/data_juicer/config/__init__.py +++ 
b/data_juicer/config/__init__.py @@ -1 +1,7 @@ -from .config import * # noqa: F401,F403 +from .config import export_config, init_configs, merge_config + +__all__ = [ + 'init_configs', + 'export_config', + 'merge_config', +] diff --git a/data_juicer/core/__init__.py b/data_juicer/core/__init__.py index cf712d21a..28a8c6d39 100644 --- a/data_juicer/core/__init__.py +++ b/data_juicer/core/__init__.py @@ -3,3 +3,11 @@ from .executor import Executor from .exporter import Exporter from .tracer import Tracer + +__all__ = [ + 'Analyser', + 'NestedDataset', + 'Executor', + 'Exporter', + 'Tracer', +] diff --git a/data_juicer/format/__init__.py b/data_juicer/format/__init__.py index cd2e10de0..e25ec9921 100644 --- a/data_juicer/format/__init__.py +++ b/data_juicer/format/__init__.py @@ -1,3 +1,16 @@ from . import (csv_formatter, json_formatter, mixture_formatter, parquet_formatter, text_formatter, tsv_formatter) +from .csv_formatter import CsvFormatter +from .formatter import LocalFormatter, RemoteFormatter +from .json_formatter import JsonFormatter from .load import load_formatter +from .mixture_formatter import MixtureFormatter +from .parquet_formatter import ParquetFormatter +from .text_formatter import TextFormatter +from .tsv_formatter import TsvFormatter + +__all__ = [ + 'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter', + 'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter', + 'MixtureFormatter' +] diff --git a/data_juicer/ops/__init__.py b/data_juicer/ops/__init__.py index c35fc22bb..ae8256850 100644 --- a/data_juicer/ops/__init__.py +++ b/data_juicer/ops/__init__.py @@ -1,3 +1,11 @@ from . 
import deduplicator, filter, mapper, selector from .base_op import OPERATORS, Deduplicator, Filter, Mapper, Selector from .load import load_ops + +__all__ = [ + 'load_ops', + 'Filter', + 'Mapper', + 'Deduplicator', + 'Selector', +] diff --git a/data_juicer/ops/common/__init__.py b/data_juicer/ops/common/__init__.py index 1218b9b12..74e8dd33d 100644 --- a/data_juicer/ops/common/__init__.py +++ b/data_juicer/ops/common/__init__.py @@ -3,3 +3,14 @@ split_on_newline_tab_whitespace, split_on_whitespace, strip, words_augmentation, words_refinement) from .special_characters import SPECIAL_CHARACTERS + +__all__ = [ + 'get_sentences_from_document', + 'get_words_from_document', + 'merge_on_whitespace_tab_newline', + 'split_on_newline_tab_whitespace', + 'split_on_whitespace', + 'strip', + 'words_augmentation', + 'words_refinement', +] diff --git a/data_juicer/ops/common/helper_func.py b/data_juicer/ops/common/helper_func.py index c8a29bf8a..58e43d36f 100644 --- a/data_juicer/ops/common/helper_func.py +++ b/data_juicer/ops/common/helper_func.py @@ -134,8 +134,8 @@ def get_words_from_document( :param document: document that need to split words. :param token_func: function of tokenizer, if specified, the function - will be used for split document into different tokens. - :param new_line: whether to use `\\\\n' to split words. + will be used for split document into different tokens. + :param new_line: whether to use '\\\\n' to split words. :param tab: whether to use '\\\\t' to split words. 
:return: word list obtained from document """ diff --git a/data_juicer/ops/deduplicator/__init__.py b/data_juicer/ops/deduplicator/__init__.py index b95e91a80..69f73b361 100644 --- a/data_juicer/ops/deduplicator/__init__.py +++ b/data_juicer/ops/deduplicator/__init__.py @@ -2,3 +2,18 @@ document_simhash_deduplicator, image_deduplicator, ray_document_deduplicator, ray_image_deduplicator, ray_video_deduplicator, video_deduplicator) +from .document_deduplicator import DocumentDeduplicator +from .document_minhash_deduplicator import DocumentMinhashDeduplicator +from .document_simhash_deduplicator import DocumentSimhashDeduplicator +from .image_deduplicator import ImageDeduplicator +from .ray_basic_deduplicator import RayBasicDeduplicator +from .ray_document_deduplicator import RayDocumentDeduplicator +from .ray_image_deduplicator import RayImageDeduplicator +from .ray_video_deduplicator import RayVideoDeduplicator +from .video_deduplicator import VideoDeduplicator + +__all__ = [ + 'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator', + 'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator', + 'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator' +] diff --git a/data_juicer/ops/deduplicator/ray_document_deduplicator.py b/data_juicer/ops/deduplicator/ray_document_deduplicator.py index 9f8d6cd91..e12eb149f 100644 --- a/data_juicer/ops/deduplicator/ray_document_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_document_deduplicator.py @@ -29,7 +29,7 @@ def __init__(self, :param redis_port: the port of redis server :param lowercase: Whether to convert sample text to lower case :param ignore_non_character: Whether to ignore non-alphabet - characters, including whitespaces, digits, and punctuations + characters, including whitespaces, digits, and punctuations :param args: extra args :param kwargs: extra args. 
""" diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index ad6718268..056da04cd 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -18,5 +18,91 @@ video_nsfw_filter, video_ocr_area_ratio_filter, video_resolution_filter, video_tagging_from_frames_filter, video_watermark_filter, word_num_filter, word_repetition_filter) +from .alphanumeric_filter import AlphanumericFilter +from .audio_duration_filter import AudioDurationFilter +from .audio_nmf_snr_filter import AudioNMFSNRFilter +from .audio_size_filter import AudioSizeFilter +from .average_line_length_filter import AverageLineLengthFilter +from .character_repetition_filter import CharacterRepetitionFilter +from .flagged_words_filter import FlaggedWordFilter +from .image_aesthetics_filter import ImageAestheticsFilter +from .image_aspect_ratio_filter import ImageAspectRatioFilter +from .image_face_ratio_filter import ImageFaceRatioFilter +from .image_nsfw_filter import ImageNSFWFilter +from .image_shape_filter import ImageShapeFilter +from .image_size_filter import ImageSizeFilter +from .image_text_matching_filter import ImageTextMatchingFilter +from .image_text_similarity_filter import ImageTextSimilarityFilter +from .image_watermark_filter import ImageWatermarkFilter +from .language_id_score_filter import LanguageIDScoreFilter +from .maximum_line_length_filter import MaximumLineLengthFilter +from .perplexity_filter import PerplexityFilter +from .phrase_grounding_recall_filter import PhraseGroundingRecallFilter +from .special_characters_filter import SpecialCharactersFilter +from .specified_field_filter import SpecifiedFieldFilter +from .specified_numeric_field_filter import SpecifiedNumericFieldFilter +from .stopwords_filter import StopWordsFilter +from .suffix_filter import SuffixFilter +from .text_action_filter import TextActionFilter +from .text_entity_dependency_filter import TextEntityDependencyFilter +from .text_length_filter import 
TextLengthFilter +from .token_num_filter import TokenNumFilter +from .video_aesthetics_filter import VideoAestheticsFilter +from .video_aspect_ratio_filter import VideoAspectRatioFilter +from .video_duration_filter import VideoDurationFilter +from .video_frames_text_similarity_filter import \ + VideoFramesTextSimilarityFilter +from .video_motion_score_filter import VideoMotionScoreFilter +from .video_nsfw_filter import VideoNSFWFilter +from .video_ocr_area_ratio_filter import VideoOcrAreaRatioFilter +from .video_resolution_filter import VideoResolutionFilter +from .video_tagging_from_frames_filter import VideoTaggingFromFramesFilter +from .video_watermark_filter import VideoWatermarkFilter +from .word_num_filter import WordNumFilter +from .word_repetition_filter import WordRepetitionFilter + +__all__ = [ + 'ImageTextSimilarityFilter', + 'VideoAspectRatioFilter', + 'ImageTextMatchingFilter', + 'ImageNSFWFilter', + 'TokenNumFilter', + 'TextLengthFilter', + 'SpecifiedNumericFieldFilter', + 'AudioNMFSNRFilter', + 'VideoAestheticsFilter', + 'PerplexityFilter', + 'PhraseGroundingRecallFilter', + 'MaximumLineLengthFilter', + 'AverageLineLengthFilter', + 'SpecifiedFieldFilter', + 'VideoTaggingFromFramesFilter', + 'TextEntityDependencyFilter', + 'VideoResolutionFilter', + 'AlphanumericFilter', + 'ImageWatermarkFilter', + 'ImageAestheticsFilter', + 'AudioSizeFilter', + 'StopWordsFilter', + 'CharacterRepetitionFilter', + 'ImageShapeFilter', + 'VideoDurationFilter', + 'TextActionFilter', + 'VideoOcrAreaRatioFilter', + 'VideoNSFWFilter', + 'SpecialCharactersFilter', + 'VideoFramesTextSimilarityFilter', + 'ImageAspectRatioFilter', + 'AudioDurationFilter', + 'LanguageIDScoreFilter', + 'SuffixFilter', + 'ImageSizeFilter', + 'VideoWatermarkFilter', + 'WordNumFilter', + 'ImageFaceRatioFilter', + 'FlaggedWordFilter', + 'WordRepetitionFilter', + 'VideoMotionScoreFilter', +] # yapf: enable diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index 
3166c5aae..90fc4898b 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -24,5 +24,98 @@ video_tagging_from_audio_mapper, video_tagging_from_frames_mapper, whitespace_normalization_mapper) +from .audio_ffmpeg_wrapped_mapper import AudioFFmpegWrappedMapper +from .chinese_convert_mapper import ChineseConvertMapper +from .clean_copyright_mapper import CleanCopyrightMapper +from .clean_email_mapper import CleanEmailMapper +from .clean_html_mapper import CleanHtmlMapper +from .clean_ip_mapper import CleanIpMapper +from .clean_links_mapper import CleanLinksMapper +from .expand_macro_mapper import ExpandMacroMapper +from .fix_unicode_mapper import FixUnicodeMapper +from .image_blur_mapper import ImageBlurMapper +from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper +from .image_captioning_mapper import ImageCaptioningMapper +from .image_diffusion_mapper import ImageDiffusionMapper +from .image_face_blur_mapper import ImageFaceBlurMapper +from .nlpaug_en_mapper import NlpaugEnMapper +from .nlpcda_zh_mapper import NlpcdaZhMapper +from .punctuation_normalization_mapper import PunctuationNormalizationMapper +from .remove_bibliography_mapper import RemoveBibliographyMapper +from .remove_comments_mapper import RemoveCommentsMapper +from .remove_header_mapper import RemoveHeaderMapper +from .remove_long_words_mapper import RemoveLongWordsMapper +from .remove_non_chinese_character_mapper import \ + RemoveNonChineseCharacterlMapper +from .remove_repeat_sentences_mapper import RemoveRepeatSentencesMapper +from .remove_specific_chars_mapper import RemoveSpecificCharsMapper +from .remove_table_text_mapper import RemoveTableTextMapper +from .remove_words_with_incorrect_substrings_mapper import \ + RemoveWordsWithIncorrectSubstringsMapper +from .replace_content_mapper import ReplaceContentMapper +from .sentence_split_mapper import SentenceSplitMapper +from .video_captioning_from_audio_mapper import 
VideoCaptioningFromAudioMapper +from .video_captioning_from_frames_mapper import \ + VideoCaptioningFromFramesMapper +from .video_captioning_from_summarizer_mapper import \ + VideoCaptioningFromSummarizerMapper +from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper +from .video_face_blur_mapper import VideoFaceBlurMapper +from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper +from .video_remove_watermark_mapper import VideoRemoveWatermarkMapper +from .video_resize_aspect_ratio_mapper import VideoResizeAspectRatioMapper +from .video_resize_resolution_mapper import VideoResizeResolutionMapper +from .video_split_by_duration_mapper import VideoSplitByDurationMapper +from .video_split_by_key_frame_mapper import VideoSplitByKeyFrameMapper +from .video_split_by_scene_mapper import VideoSplitBySceneMapper +from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper +from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper +from .whitespace_normalization_mapper import WhitespaceNormalizationMapper + +__all__ = [ + 'VideoCaptioningFromAudioMapper', + 'VideoTaggingFromAudioMapper', + 'ImageCaptioningFromGPT4VMapper', + 'PunctuationNormalizationMapper', + 'RemoveBibliographyMapper', + 'SentenceSplitMapper', + 'VideoSplitBySceneMapper', + 'CleanIpMapper', + 'CleanLinksMapper', + 'RemoveHeaderMapper', + 'RemoveTableTextMapper', + 'VideoRemoveWatermarkMapper', + 'RemoveRepeatSentencesMapper', + 'ImageDiffusionMapper', + 'ImageFaceBlurMapper', + 'VideoFFmpegWrappedMapper', + 'ChineseConvertMapper', + 'NlpcdaZhMapper', + 'ImageBlurMapper', + 'CleanCopyrightMapper', + 'RemoveNonChineseCharacterlMapper', + 'VideoSplitByKeyFrameMapper', + 'RemoveSpecificCharsMapper', + 'VideoResizeAspectRatioMapper', + 'CleanHtmlMapper', + 'WhitespaceNormalizationMapper', + 'VideoTaggingFromFramesMapper', + 'RemoveCommentsMapper', + 'ExpandMacroMapper', + 'ImageCaptioningMapper', + 'RemoveWordsWithIncorrectSubstringsMapper', + 
'VideoCaptioningFromVideoMapper', + 'VideoCaptioningFromSummarizerMapper', + 'FixUnicodeMapper', + 'NlpaugEnMapper', + 'VideoCaptioningFromFramesMapper', + 'RemoveLongWordsMapper', + 'VideoResizeResolutionMapper', + 'CleanEmailMapper', + 'ReplaceContentMapper', + 'AudioFFmpegWrappedMapper', + 'VideoSplitByDurationMapper', + 'VideoFaceBlurMapper', +] # yapf: enable diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index e4ec8f4a6..8ba01b61b 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -33,7 +33,7 @@ def __init__(self, Initialization method. :param blur_type: Type of blur kernel, including - ['mean', 'box', 'gaussian']. + ['mean', 'box', 'gaussian']. :param radius: Radius of blur kernel. :param args: extra args :param kwargs: extra args diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py index 17a3f6d54..a3abb233e 100644 --- a/data_juicer/ops/mapper/video_face_blur_mapper.py +++ b/data_juicer/ops/mapper/video_face_blur_mapper.py @@ -32,7 +32,7 @@ def __init__(self, Initialization method. :param blur_type: Type of blur kernel, including - ['mean', 'box', 'gaussian']. + ['mean', 'box', 'gaussian']. :param radius: Radius of blur kernel. :param args: extra args :param kwargs: extra args diff --git a/data_juicer/ops/selector/__init__.py b/data_juicer/ops/selector/__init__.py index cf0977321..c37998a9a 100644 --- a/data_juicer/ops/selector/__init__.py +++ b/data_juicer/ops/selector/__init__.py @@ -1 +1,5 @@ from . 
import frequency_specified_field_selector, topk_specified_field_selector +from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector +from .topk_specified_field_selector import TopkSpecifiedFieldSelector + +__all__ = ['FrequencySpecifiedFieldSelector', 'TopkSpecifiedFieldSelector'] diff --git a/data_juicer/utils/process_utils.py b/data_juicer/utils/process_utils.py index 3a5ec7d43..2aa60e19f 100644 --- a/data_juicer/utils/process_utils.py +++ b/data_juicer/utils/process_utils.py @@ -2,7 +2,6 @@ import subprocess import psutil -import torch from loguru import logger from data_juicer import cuda_device_count, use_cuda @@ -10,6 +9,7 @@ def get_min_cuda_memory(): # get cuda memory info using "nvidia-smi" command + import torch min_cuda_memory = torch.cuda.get_device_properties( 0).total_memory / 1024**2 nvidia_smi_output = subprocess.check_output([ @@ -23,6 +23,7 @@ def get_min_cuda_memory(): def calculate_np(num_proc, op, op_name): + """Calculate the optimum number of processes for the given OP""" if num_proc is None: num_proc = psutil.cpu_count() if use_cuda() and op._accelerator == 'cuda': diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index 21c5f67b4..cb53ca327 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -196,6 +196,12 @@ class StatsKeys(object): ```python from . import (..., # other OPs text_length_filter) # import this new OP module +# other OPs +from .text_length_filter import TextLengthFilter # import this new OP class +__all__ = [ + # other Ops + 'TextLengthFilter', # add this new Op to __all__ +] ``` 4. Now you can use this new OP with custom arguments in your own config files! @@ -279,7 +285,6 @@ the corresponding documents, including the following docs: 3. `docs/Operators_ZH.md`: this doc is the Chinese version of the doc in 6.ii, so we need to update the Chinese content at the same positions. - 4. 
`docs/sphinx_doc/source/data_juicer.ops.{filter | mapper | deduplicator | selector}.rst`: this doc is the index of API reference. When the operator file name is modified or an operator file is added or deleted, the corresponding entries in the file need to be updated accordingly. ### (Optional) Make your OP fusible diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index dbdf18997..6af83c3a9 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -187,9 +187,14 @@ class StatsKeys(object): 3. 实现后,将其添加到 `data_juicer/ops/filter` 目录下 `__init__.py` 文件中的算子字典中: ```python -from . import (..., # other ops - text_length_filter) # import this new op module - +from . import (..., # other OPs + text_length_filter) # import this new OP module +# other OPs +from .text_length_filter import TextLengthFilter # import this new OP class +__all__ = [ + # other Ops + 'TextLengthFilter', # add this new Op to __all__ +] ``` 4. 全部完成!现在您可以在自己的配置文件中使用新添加的算子: @@ -268,7 +273,6 @@ if __name__ == '__main__': 3. `docs/Operators_ZH.md`:该文档为6.ii中`docs/Operators.md`文档的中文版,需要更新相同位置处的中文内容。 - 4. `docs/sphinx_doc/source/data_juicer.ops.{filter | mapper | deduplicator | selector}.rst`: 该文档为 API 文档索引,在修改算子文件名称或增删算子文件的情况下需要对应更新文件中对应的条目。 ### (可选)使新算子可以进行算子融合 diff --git a/docs/sphinx_doc/README.md b/docs/sphinx_doc/README.md index eb4372201..43127bbeb 100644 --- a/docs/sphinx_doc/README.md +++ b/docs/sphinx_doc/README.md @@ -9,17 +9,11 @@ doc, please run the following commands: # 1. install the sphinx requirements and init the sphinx-quickstart pip install sphinx sphinx-autobuild sphinx_rtd_theme recommonmark # or pip install -r ../../environments/dev_requires -sphinx-quickstart -# 2. auto generate the doc files for all sub modules (*.rst) from source codes -sphinx-apidoc -o source ../../data_juicer +# 2. auto generate and build the doc +./build_doc.sh -# 3. modify the auto-generated files according to your requirements -vim source/modules.rst - -# 4. 
finalize the doc, which is stored in the `build/html` directory -make clean -make html +# 3. finalize the doc, which is stored in the `build/html` directory mv build/html position_to_publish ``` diff --git a/docs/sphinx_doc/README_ZH.md b/docs/sphinx_doc/README_ZH.md index 4f57ea167..e02179532 100644 --- a/docs/sphinx_doc/README_ZH.md +++ b/docs/sphinx_doc/README_ZH.md @@ -8,17 +8,10 @@ Data-Juicer 借助 Sphinx 构建 API 文档。 # 1.安装 sphinx 的依赖并初始化 sphinx-quickstart pip install sphinx sphinx-autobuild sphinx_rtd_theme recommonmark # or pip install -r ../../environments/dev_requires -sphinx-quickstart +# 2. 运行文档构建脚本 +./build_doc.sh -# 2. 从源代码自动生成所有子模块(*.rst)的文档文件 -sphinx-apidoc -o source ../../data_juicer - -# 3. 根据您的要求修改自动生成的文件 -vim source/modules.rst - -# 4. 完成文档的构建,文档存储目录为 `build/html` -make clean -make html +# 3. 构建完成的文档存储目录为 `build/html` mv build/html position_to_publish ``` diff --git a/docs/sphinx_doc/_templates/module.rst_t b/docs/sphinx_doc/_templates/module.rst_t deleted file mode 100644 index 249027855..000000000 --- a/docs/sphinx_doc/_templates/module.rst_t +++ /dev/null @@ -1,9 +0,0 @@ -{%- if show_headings %} -{{- [basename, "module"] | join(' ') | e | heading }} - -{% endif -%} -.. automodule:: {{ qualname }} -{%- for option in automodule_options %} - :{{ option }}: -{%- endfor %} - diff --git a/docs/sphinx_doc/_templates/package.rst_t b/docs/sphinx_doc/_templates/package.rst_t index cae4ac115..2951c5530 100644 --- a/docs/sphinx_doc/_templates/package.rst_t +++ b/docs/sphinx_doc/_templates/package.rst_t @@ -5,6 +5,8 @@ {%- endfor %} {%- endmacro %} +{{- pkgname | heading }} + {%- macro toctree(docnames) -%} .. toctree:: :maxdepth: {{ maxdepth }} @@ -13,33 +15,4 @@ {%- endfor %} {%- endmacro %} -{%- if is_namespace %} -{{- [pkgname, "namespace"] | join(" ") | e | heading }} -{% else %} -{{- pkgname | e | heading }} -{% endif %} - -{%- if is_namespace %} -.. 
py:module:: {{ pkgname }} -{% endif %} - -{%- if modulefirst and not is_namespace %} {{ automodule(pkgname, automodule_options) }} -{% endif %} - -{%- if subpackages %} -{{ toctree(subpackages) }} -{% endif %} - -{%- if submodules %} -{% if separatemodules %} -{{ toctree(submodules) }} -{% else %} -{%- for submodule in submodules %} -{% if show_headings %} -{{- submodule | e | heading(2) }} -{% endif %} -{{ automodule(submodule, automodule_options) }} -{% endfor %} -{%- endif %} -{%- endif %} diff --git a/docs/sphinx_doc/_templates/toc.rst_t b/docs/sphinx_doc/_templates/toc.rst_t deleted file mode 100644 index f0877eeb2..000000000 --- a/docs/sphinx_doc/_templates/toc.rst_t +++ /dev/null @@ -1,8 +0,0 @@ -{{ header | heading }} - -.. toctree:: - :maxdepth: {{ maxdepth }} -{% for docname in docnames %} - {{ docname }} -{%- endfor %} - diff --git a/docs/sphinx_doc/build_doc.sh b/docs/sphinx_doc/build_doc.sh new file mode 100755 index 000000000..b22c7d9c3 --- /dev/null +++ b/docs/sphinx_doc/build_doc.sh @@ -0,0 +1,3 @@ +#!/bin/bash +sphinx-apidoc -f -o source ../../data_juicer -t _templates +make clean html \ No newline at end of file diff --git a/docs/sphinx_doc/source/conf.py b/docs/sphinx_doc/source/conf.py index 2d91a83b0..8b5921558 100644 --- a/docs/sphinx_doc/source/conf.py +++ b/docs/sphinx_doc/source/conf.py @@ -24,10 +24,20 @@ extensions = [ 'sphinx.ext.autodoc', + "sphinx.ext.autosummary", 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', + "sphinx.ext.autosectionlabel", ] +# Prefix document path to section labels, otherwise autogenerated labels would +# look like 'heading' rather than 'path/to/file:heading' +autosectionlabel_prefix_document = True +autosummary_generate = True +autosummary_ignore_module_all = False + +autodoc_member_order = "bysource" + templates_path = ['_templates'] exclude_patterns = ['build'] @@ -36,6 +46,10 @@ html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +html_theme_options = { + 
"navigation_depth": 2, +} + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". diff --git a/docs/sphinx_doc/source/data_juicer.analysis.rst b/docs/sphinx_doc/source/data_juicer.analysis.rst deleted file mode 100644 index 2a053700f..000000000 --- a/docs/sphinx_doc/source/data_juicer.analysis.rst +++ /dev/null @@ -1,52 +0,0 @@ -data\_juicer.analysis -============================= - - - -data\_juicer.analysis.collector --------------------------------------- - -.. automodule:: data_juicer.analysis.collector - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.column\_wise\_analysis ---------------------------------------------------- - -.. automodule:: data_juicer.analysis.column_wise_analysis - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.diversity\_analysis ------------------------------------------------- - -.. automodule:: data_juicer.analysis.diversity_analysis - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.draw ---------------------------------- - -.. automodule:: data_juicer.analysis.draw - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.measure ------------------------------------- - -.. automodule:: data_juicer.analysis.measure - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.analysis.overall\_analysis ----------------------------------------------- - -.. 
automodule:: data_juicer.analysis.overall_analysis - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.config.rst b/docs/sphinx_doc/source/data_juicer.config.rst deleted file mode 100644 index c77412dc4..000000000 --- a/docs/sphinx_doc/source/data_juicer.config.rst +++ /dev/null @@ -1,11 +0,0 @@ -data\_juicer.config -=========================== - - -data\_juicer.config.config ---------------------------------- - -.. automodule:: data_juicer.config.config - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.core.rst b/docs/sphinx_doc/source/data_juicer.core.rst deleted file mode 100644 index 4d2ea7df6..000000000 --- a/docs/sphinx_doc/source/data_juicer.core.rst +++ /dev/null @@ -1,51 +0,0 @@ -data\_juicer.core -========================= - - -data\_juicer.core.analyser ---------------------------------- - -.. automodule:: data_juicer.core.analyser - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.data ------------------------------ - -.. automodule:: data_juicer.core.data - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.executor ---------------------------------- - -.. automodule:: data_juicer.core.executor - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.exporter ---------------------------------- - -.. automodule:: data_juicer.core.exporter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.ray\_executor --------------------------------------- - -.. automodule:: data_juicer.core.ray_executor - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.core.tracer -------------------------------- - -.. 
automodule:: data_juicer.core.tracer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.format.rst b/docs/sphinx_doc/source/data_juicer.format.rst deleted file mode 100644 index 837bac07e..000000000 --- a/docs/sphinx_doc/source/data_juicer.format.rst +++ /dev/null @@ -1,67 +0,0 @@ -data\_juicer.format -=========================== - - -data\_juicer.format.csv\_formatter ------------------------------------------ - -.. automodule:: data_juicer.format.csv_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.formatter ------------------------------------- - -.. automodule:: data_juicer.format.formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.json\_formatter ------------------------------------------- - -.. automodule:: data_juicer.format.json_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.load -------------------------------- - -.. automodule:: data_juicer.format.load - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.mixture\_formatter ---------------------------------------------- - -.. automodule:: data_juicer.format.mixture_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.parquet\_formatter ---------------------------------------------- - -.. automodule:: data_juicer.format.parquet_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.text\_formatter ------------------------------------------- - -.. automodule:: data_juicer.format.text_formatter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.format.tsv\_formatter ------------------------------------------ - -.. 
automodule:: data_juicer.format.tsv_formatter - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.common.rst b/docs/sphinx_doc/source/data_juicer.ops.common.rst deleted file mode 100644 index db392dd3d..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.common.rst +++ /dev/null @@ -1,19 +0,0 @@ -data\_juicer.ops.common -======================= - - -data\_juicer.ops.common.helper\_func -------------------------------------------- - -.. automodule:: data_juicer.ops.common.helper_func - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.common.special\_characters --------------------------------------------------- - -.. automodule:: data_juicer.ops.common.special_characters - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst b/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst deleted file mode 100644 index d4fa76e5f..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.deduplicator.rst +++ /dev/null @@ -1,43 +0,0 @@ -data\_juicer.ops.deduplicator -===================================== - - -data\_juicer.ops.deduplicator.document\_deduplicator ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.deduplicator.document_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.document\_minhash\_deduplicator --------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.deduplicator.document_minhash_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.document\_simhash\_deduplicator --------------------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.deduplicator.document_simhash_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.image\_deduplicator --------------------------------------------------------- - -.. automodule:: data_juicer.ops.deduplicator.image_deduplicator - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.deduplicator.video\_deduplicator --------------------------------------------------------- - -.. automodule:: data_juicer.ops.deduplicator.video_deduplicator - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.filter.rst b/docs/sphinx_doc/source/data_juicer.ops.filter.rst deleted file mode 100644 index 1817861f7..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.filter.rst +++ /dev/null @@ -1,330 +0,0 @@ -data\_juicer.ops.filter -=============================== - -data\_juicer.ops.filter.alphanumeric\_filter ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.alphanumeric_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.audio\_duration\_filter ------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.audio_duration_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.audio\_nmf\_snr\_filter ------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.audio_nmf_snr_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.audio\_size\_filter --------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.audio_size_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.average\_line\_length\_filter ------------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.filter.average_line_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.character\_repetition\_filter ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.character_repetition_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.flagged\_words\_filter ------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.flagged_words_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_aesthetics\_filter --------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.image_aesthetics_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_aspect\_ratio\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.image_aspect_ratio_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_face\_ratio\_filter -------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.image_face_ratio_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_nsfw\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.image_nsfw_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_shape\_filter ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.image_shape_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_size\_filter --------------------------------------------------- - -.. 
automodule:: data_juicer.ops.filter.image_size_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_text\_matching\_filter ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.image_text_matching_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_text\_similarity\_filter --------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.image_text_similarity_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.image\_watermark\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.image_watermark_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.language\_id\_score\_filter ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.language_id_score_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.maximum\_line\_length\_filter ------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.maximum_line_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.perplexity\_filter -------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.perplexity_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.phrase\_grounding\_recall\_filter ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.phrase_grounding_recall_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.special\_characters\_filter ----------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.filter.special_characters_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.specified\_field\_filter -------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.specified_field_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.specified\_numeric\_field\_filter ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.specified_numeric_field_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.stopwords\_filter ------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.stopwords_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.suffix\_filter ---------------------------------------------- - -.. automodule:: data_juicer.ops.filter.suffix_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.text\_action\_filter ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.text_action_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.text\_entity\_dependency\_filter ---------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.text_entity_dependency_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.text\_length\_filter ---------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.text_length_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.token\_num\_filter -------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.token_num_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_aesthetics\_filter --------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.filter.video_aesthetics_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_aspect\_ratio\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.video_aspect_ratio_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_duration\_filter ------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.video_duration_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_frames\_text\_similarity\_filter ----------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.video_frames_text_similarity_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_motion\_score\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.video_motion_score_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_nsfw\_filter ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.filter.video_nsfw_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_ocr\_area\_ratio\_filter --------------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.video_ocr_area_ratio_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_resolution\_filter --------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.video_resolution_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_watermark\_filter ------------------------------------------------------------ - -.. 
automodule:: data_juicer.ops.filter.video_watermark_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.video\_tagging\_from\_frames\_filter --------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.video_tagging_from_frames_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.word\_num\_filter ------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.word_num_filter - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.filter.word\_repetition\_filter -------------------------------------------------------- - -.. automodule:: data_juicer.ops.filter.word_repetition_filter - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.mapper.rst b/docs/sphinx_doc/source/data_juicer.ops.mapper.rst deleted file mode 100644 index 4e8fc7158..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.mapper.rst +++ /dev/null @@ -1,363 +0,0 @@ -data\_juicer.ops.mapper -=============================== - - -data\_juicer.ops.mapper.audio\_ffmpeg\_wrapped\_mapper -------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.chinese\_convert\_mapper -------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.chinese_convert_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_copyright\_mapper -------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_copyright_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_email\_mapper ---------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.clean_email_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_html\_mapper --------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_html_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_ip\_mapper ------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_ip_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.clean\_links\_mapper ---------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.clean_links_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.expand\_macro\_mapper ----------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.expand_macro_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.fix\_unicode\_mapper ---------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.fix_unicode_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.generate\_caption\_mapper --------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.generate_caption_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.gpt4v\_generate\_mapper ------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.gpt4v_generate_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.image\_blur\_mapper --------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.image_blur_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.image\_captioning\_from\_gpt4v\_mapper ---------------------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.image\_captioning\_mapper --------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.image_captioning_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.image\_diffusion\_mapper -------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.image_diffusion_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.image\_face\_blur\_mapper -------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.image_face_blur_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.nlpaug\_en\_mapper -------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.nlpaug_en_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.nlpcda\_zh\_mapper -------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.nlpcda_zh_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.punctuation\_normalization\_mapper ------------------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.punctuation_normalization_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_bibliography\_mapper ------------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.remove_bibliography_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_comments\_mapper -------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.remove_comments_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_header\_mapper ------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.remove_header_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_long\_words\_mapper ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_long_words_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_non\_chinese\_character\_mapper ----------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_non_chinese_character_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_repeat\_sentences\_mapper ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_repeat_sentences_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_specific\_chars\_mapper --------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_specific_chars_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_table\_text\_mapper ----------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_table_text_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.remove\_words\_with\_incorrect\_substrings\_mapper ---------------------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.replace\_content\_mapper -------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.replace_content_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.sentence\_split\_mapper ------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.sentence_split_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_captioning\_from\_audio\_mapper ---------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_captioning_from_audio_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_captioning\_from\_frames\_mapper ---------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_captioning_from_frames_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_captioning\_from\_summarizer\_mapper ---------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_captioning_from_summarizer_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_captioning\_from\_video\_mapper ---------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_captioning_from_video_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_face\_blur\_mapper -------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_face_blur_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_ffmpeg\_wrapped\_mapper -------------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_remove\_watermark\_mapper -------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_remove_watermark_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_resize\_aspect\_ratio\_mapper -------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_resize_aspect_ratio_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_resize\_resolution\_mapper ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_resize_resolution_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_split\_by\_duration\_mapper ------------------------------------------------------------------ - -.. automodule:: data_juicer.ops.mapper.video_split_by_duration_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_split\_by\_key\_frame\_mapper -------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_split_by_key_frame_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_split\_by\_scene\_mapper --------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_split_by_scene_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_tagging\_from\_audio\_mapper ------------------------------------------------------------------- - -.. 
automodule:: data_juicer.ops.mapper.video_tagging_from_audio_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.video\_tagging\_from\_frames\_mapper -------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.video_tagging_from_frames_mapper - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.mapper.whitespace\_normalization\_mapper ----------------------------------------------------------------- - -.. automodule:: data_juicer.ops.mapper.whitespace_normalization_mapper - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.rst b/docs/sphinx_doc/source/data_juicer.ops.rst deleted file mode 100644 index 404379fe7..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.rst +++ /dev/null @@ -1,36 +0,0 @@ -data\_juicer.ops -======================== - -.. toctree:: - :maxdepth: 4 - - data_juicer.ops.common - data_juicer.ops.deduplicator - data_juicer.ops.filter - data_juicer.ops.mapper - data_juicer.ops.selector - - -data\_juicer.ops.base\_op --------------------------------- - -.. automodule:: data_juicer.ops.base_op - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.load ----------------------------- - -.. automodule:: data_juicer.ops.load - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.op\_fusion ----------------------------------- - -.. 
automodule:: data_juicer.ops.op_fusion - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.ops.selector.rst b/docs/sphinx_doc/source/data_juicer.ops.selector.rst deleted file mode 100644 index 64ef25e6b..000000000 --- a/docs/sphinx_doc/source/data_juicer.ops.selector.rst +++ /dev/null @@ -1,19 +0,0 @@ -data\_juicer.ops.selector -================================= - - -data\_juicer.ops.selector.frequency\_specified\_field\_selector ----------------------------------------------------------------------- - -.. automodule:: data_juicer.ops.selector.frequency_specified_field_selector - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.ops.selector.topk\_specified\_field\_selector ------------------------------------------------------------------ - -.. automodule:: data_juicer.ops.selector.topk_specified_field_selector - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/data_juicer.rst b/docs/sphinx_doc/source/data_juicer.rst deleted file mode 100644 index 076b6f360..000000000 --- a/docs/sphinx_doc/source/data_juicer.rst +++ /dev/null @@ -1,13 +0,0 @@ -data\_juicer -============ - -.. 
toctree:: - :maxdepth: 4 - - data_juicer.analysis - data_juicer.config - data_juicer.core - data_juicer.format - data_juicer.ops - data_juicer.tools - data_juicer.utils diff --git a/docs/sphinx_doc/source/data_juicer.tools.rst b/docs/sphinx_doc/source/data_juicer.tools.rst deleted file mode 100644 index 6d25049a6..000000000 --- a/docs/sphinx_doc/source/data_juicer.tools.rst +++ /dev/null @@ -1,2 +0,0 @@ -data\_juicer.tools -================== diff --git a/docs/sphinx_doc/source/data_juicer.utils.rst b/docs/sphinx_doc/source/data_juicer.utils.rst deleted file mode 100644 index 280e8db01..000000000 --- a/docs/sphinx_doc/source/data_juicer.utils.rst +++ /dev/null @@ -1,107 +0,0 @@ -data\_juicer.utils -================== - - -data\_juicer.utils.asset\_utils --------------------------------------- - -.. automodule:: data_juicer.utils.asset_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.availability\_utils ---------------------------------------------- - -.. automodule:: data_juicer.utils.availability_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.cache\_utils --------------------------------------- - -.. automodule:: data_juicer.utils.cache_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.ckpt\_utils -------------------------------------- - -.. automodule:: data_juicer.utils.ckpt_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.compress ----------------------------------- - -.. automodule:: data_juicer.utils.compress - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.constant ----------------------------------- - -.. automodule:: data_juicer.utils.constant - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.file\_utils -------------------------------------- - -.. 
automodule:: data_juicer.utils.file_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.fingerprint\_utils --------------------------------------------- - -.. automodule:: data_juicer.utils.fingerprint_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.logger\_utils ---------------------------------------- - -.. automodule:: data_juicer.utils.logger_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.mm\_utils ------------------------------------ - -.. automodule:: data_juicer.utils.mm_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.model\_utils --------------------------------------- - -.. automodule:: data_juicer.utils.model_utils - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.registry ----------------------------------- - -.. automodule:: data_juicer.utils.registry - :members: - :undoc-members: - :show-inheritance: - -data\_juicer.utils.unittest\_utils ------------------------------------------ - -.. automodule:: data_juicer.utils.unittest_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/sphinx_doc/source/index.rst b/docs/sphinx_doc/source/index.rst index 9c098d834..78f525425 100644 --- a/docs/sphinx_doc/source/index.rst +++ b/docs/sphinx_doc/source/index.rst @@ -8,10 +8,19 @@ Welcome to data-juicer's documentation! .. toctree:: :maxdepth: 2 - :caption: References: - -.. 
include:: modules.rst + :glob: + :caption: Data-Juicer API Reference + data_juicer.core + data_juicer.ops + data_juicer.ops.filter + data_juicer.ops.mapper + data_juicer.ops.deduplicator + data_juicer.ops.selector + data_juicer.ops.common + data_juicer.analysis + data_juicer.config + data_juicer.format Indices and tables ================== diff --git a/docs/sphinx_doc/source/modules.rst b/docs/sphinx_doc/source/modules.rst deleted file mode 100644 index 2845759f3..000000000 --- a/docs/sphinx_doc/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -data_juicer -=========== - -.. toctree:: - :maxdepth: 4 - - data_juicer diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index d7696c75b..7c67a3f65 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -24,3 +24,4 @@ streamlit spacy==3.5.0 multiprocess==0.70.12 dill==0.3.4 +psutil