diff --git a/README.md b/README.md index 6a393fa6b..b03f37ca9 100644 --- a/README.md +++ b/README.md @@ -9,14 +9,17 @@ English | [**中文**](README_ZH.md) ![](https://img.shields.io/badge/license-Apache--2.0-000000.svg) [![Contributing](https://img.shields.io/badge/Contribution-welcome-brightgreen.svg)](docs/DeveloperGuide.md) -[![pypi version](https://img.shields.io/pypi/v/py-data-juicer?color=026cad)](https://pypi.org/project/py-data-juicer) -[![Docker version](https://img.shields.io/docker/v/datajuicer/data-juicer?label=Docker&color=498bdf)](https://hub.docker.com/r/datajuicer/data-juicer) +[![pypi version](https://img.shields.io/pypi/v/py-data-juicer?logo=pypi&color=026cad)](https://pypi.org/project/py-data-juicer) +[![Docker version](https://img.shields.io/docker/v/datajuicer/data-juicer?logo=docker&label=Docker&color=498bdf)](https://hub.docker.com/r/datajuicer/data-juicer) [![Document_List](https://img.shields.io/badge/Docs-English-blue?logo=Markdown)](README.md#documentation) [![文档列表](https://img.shields.io/badge/文档-中文-blue?logo=Markdown)](README_ZH.md#documentation) [![API Reference](https://img.shields.io/badge/Docs-API_Reference-blue?logo=Markdown)](https://alibaba.github.io/data-juicer/) -[![ModelScope-10+ 
Demos](https://img.shields.io/badge/ModelScope-10+_Demos-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](#demos) +[![ModelScope-10+ 
Demos](https://img.shields.io/badge/ModelScope-10+_Demos-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](https://modelscope.cn/studios?name=Data-Juicer&page=1&sort=latest&type=1) 
[![ModelScope-20+_Refined_Datasets](https://img.shields.io/badge/ModelScope-20+_Refined_Datasets-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](https://modelscope.cn/datasets?organization=Data-Juicer&page=1) +[![HuggingFace-10+ Demos](https://img.shields.io/badge/🤗HuggingFace-10+_Demos-FFD21E.svg)](https://huggingface.co/datajuicer) +[![HuggingFace-20+_Refined_Datasets](https://img.shields.io/badge/🤗HuggingFace-20+_Refined_Datasets-FFD21E.svg)](https://huggingface.co/datajuicer) + [![QualityClassifier](https://img.shields.io/badge/Tools-Quality_Classifier-saddlebrown?logo=Markdown)](tools/quality_classifier/README.md) [![AutoEvaluation](https://img.shields.io/badge/Tools-Auto_Evaluation-saddlebrown?logo=Markdown)](tools/evaluator/README.md) @@ -288,22 +291,22 @@ docker exec -it bash - [Refined recipes for fine-tuning data](configs/data_juicer_recipes/README.md#before-and-after-refining-for-alpaca-cot-dataset) ## Demos 
-- Introduction to Data-Juicer [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] +- Introduction to Data-Juicer [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/overview_scan)] - Data Visualization: - - Basic Statistics [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] - - Lexical Diversity [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] - - Operator Effect [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] + - Basic Statistics [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_statistics)] + - Lexical Diversity [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_diversity)] + - Operator Effect [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_op_effect)] - Data Processing: - - Scientific Literature (e.g. [arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] - - Programming Code (e.g. [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] - - Chinese Instruction Data (e.g. [Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] + - Scientific Literature (e.g. 
[arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_sci_data)] + - Programming Code (e.g. [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_code_data)] + - Chinese Instruction Data (e.g. [Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_cft_zh_data)] - Tool Pool: - - Dataset Splitting by Language [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] - - Quality Classifier for CommonCrawl [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] - - Auto Evaluation on [HELM](https://github.com/stanford-crfm/helm) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] - - Data Sampling and Mixture [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] -- Data Processing Loop [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] -- Data Processing HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] + - Dataset Splitting by Language [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_dataset_splitting_by_language)] + - Quality Classifier for CommonCrawl [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_quality_classifier)] + - Auto Evaluation on [HELM](https://github.com/stanford-crfm/helm) 
[[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/auto_evaluation_helm)] + - Data Sampling and Mixture [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_mixture)] +- Data Processing Loop [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_loop)] +- Data Processing HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_hpo)] ## License Data-Juicer is released under Apache License 2.0. diff --git a/README_ZH.md b/README_ZH.md index 81bf11caa..ae7565de4 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -9,14 +9,17 @@ ![](https://img.shields.io/badge/license-Apache--2.0-000000.svg) [![Contributing](https://img.shields.io/badge/Contribution-welcome-brightgreen.svg)](docs/DeveloperGuide_ZH.md) -[![pypi version](https://img.shields.io/pypi/v/py-data-juicer?color=026cad)](https://pypi.org/project/py-data-juicer) -[![Docker version](https://img.shields.io/docker/v/datajuicer/data-juicer?label=Docker&color=498bdf)](https://hub.docker.com/r/datajuicer/data-juicer) +[![pypi version](https://img.shields.io/pypi/v/py-data-juicer?logo=pypi&color=026cad)](https://pypi.org/project/py-data-juicer) +[![Docker version](https://img.shields.io/docker/v/datajuicer/data-juicer?logo=docker&label=Docker&color=498bdf)](https://hub.docker.com/r/datajuicer/data-juicer) [![Document_List](https://img.shields.io/badge/Docs-English-blue?logo=Markdown)](README.md#documentation) [![文档列表](https://img.shields.io/badge/文档-中文-blue?logo=Markdown)](README_ZH.md#documentation) [![API Reference](https://img.shields.io/badge/Docs-API_Reference-blue?logo=Markdown)](https://alibaba.github.io/data-juicer/) -[![ModelScope-10+ 
Demos](https://img.shields.io/badge/ModelScope-10+_Demos-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](#demos) 
-[![ModelScope-20+_Refined_Datasets](https://img.shields.io/badge/ModelScope-20+_Refined_Datasets-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](#data-recipes) +[![ModelScope-10+ 
Demos](https://img.shields.io/badge/ModelScope-10+_Demos-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](https://modelscope.cn/studios?name=Data-Juicer&page=1&sort=latest&type=1) 
+[![ModelScope-20+_Refined_Datasets](https://img.shields.io/badge/ModelScope-20+_Refined_Datasets-4e29ff.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjI0IDEyMS4zMyIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCTxwYXRoIGQ9Im0wIDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtOTkuMTQgNzMuNDloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xNzYuMDkgOTkuMTRoLTI1LjY1djIyLjE5aDQ3Ljg0di00Ny44NGgtMjIuMTl6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTEyNC43OSA0Ny44NGgyNS42NXYyNS42NWgtMjUuNjV6IiBmaWxsPSIjMzZjZmQxIiAvPgoJPHBhdGggZD0ibTAgMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xOTguMjggNDcuODRoMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzYyNGFmZiIgLz4KCTxwYXRoIGQ9Im0xOTguMjggMjIuMTloMjUuNjV2MjUuNjVoLTI1LjY1eiIgZmlsbD0iIzM2Y2ZkMSIgLz4KCTxwYXRoIGQ9Im0xNTAuNDQgMHYyMi4xOWgyNS42NXYyNS42NWgyMi4xOXYtNDcuODR6IiBmaWxsPSIjNjI0YWZmIiAvPgoJPHBhdGggZD0ibTczLjQ5IDQ3Ljg0aDI1LjY1djI1LjY1aC0yNS42NXoiIGZpbGw9IiMzNmNmZDEiIC8+Cgk8cGF0aCBkPSJtNDcuODQgMjIuMTloMjUuNjV2LTIyLjE5aC00Ny44NHY0Ny44NGgyMi4xOXoiIGZpbGw9IiM2MjRhZmYiIC8+Cgk8cGF0aCBkPSJtNDcuODQgNzMuNDloLTIyLjE5djQ3Ljg0aDQ3Ljg0di0yMi4xOWgtMjUuNjV6IiBmaWxsPSIjNjI0YWZmIiAvPgo8L3N2Zz4K)](https://modelscope.cn/datasets?organization=Data-Juicer&page=1) +[![HuggingFace-10+ Demos](https://img.shields.io/badge/🤗HuggingFace-10+_Demos-FFD21E.svg)](https://huggingface.co/datajuicer) +[![HuggingFace-20+_Refined_Datasets](https://img.shields.io/badge/🤗HuggingFace-20+_Refined_Datasets-FFD21E.svg)](https://huggingface.co/datajuicer) + [![QualityClassifier](https://img.shields.io/badge/Tools-Quality_Classifier-saddlebrown?logo=Markdown)](tools/quality_classifier/README_ZH.md) [![AutoEvaluation](https://img.shields.io/badge/Tools-Auto_Evaluation-saddlebrown?logo=Markdown)](tools/evaluator/README_ZH.md) @@ -270,22 +273,22 @@ docker exec -it bash ## 演示样例 -* Data-Juicer 介绍 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] +* Data-Juicer 介绍 
[[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/overview_scan)] * 数据可视化: - * 基础指标统计 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] - * 词汇多样性 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] - * 算子效果 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] + * 基础指标统计 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_statistics)] + * 词汇多样性 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_diversity)] + * 算子效果 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_op_effect)] * 数据处理: - * 科学文献 (例如 [arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] - * 编程代码 (例如 [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] - * 中文指令数据 (例如 [Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] + * 科学文献 (例如 [arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_sci_data)] + * 编程代码 (例如 [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_code_data)] 
+ * 中文指令数据 (例如 [Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_cft_zh_data)] * 工具池: - * 按语言分割数据集 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] - * CommonCrawl 质量分类器 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] - * 基于 [HELM](https://github.com/stanford-crfm/helm) 的自动评测 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] - * 数据采样及混合 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] -* 数据处理回路 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] -* 数据处理 HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] + * 按语言分割数据集 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_dataset_splitting_by_language)] + * CommonCrawl 质量分类器 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_quality_classifier)] + * 基于 [HELM](https://github.com/stanford-crfm/helm) 的自动评测 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/auto_evaluation_helm)] + * 数据采样及混合 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_mixture)] +* 数据处理回路 [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_loop)] +* 数据处理 HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] 
[[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_hpo)] ## 开源协议 diff --git a/configs/data_juicer_recipes/README.md b/configs/data_juicer_recipes/README.md index 6818e4be7..f04894f91 100644 --- a/configs/data_juicer_recipes/README.md +++ b/configs/data_juicer_recipes/README.md @@ -8,30 +8,30 @@ We use simple 3-σ rule to set the hyperparameters for ops in each recipe. | subset | #samples before | #samples after | keep ratio | config link | data link | source | |----------------------|:---------------------------:|:--------------:|:----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary) | Redpajama | -| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary) | Redpajama | -| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary) | Redpajama | -| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary) | Redpajama | -| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary) | Redpajama
The Stack | -| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary) | Redpajama
The Pile | -| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary) | The Pile | -| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary) | The Pile | -| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary) | The Pile | -| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary) | The Pile | -| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary) | The Pile | -| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary) | The Pile | -| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary) | The Pile | -| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary) | The Pile | +| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-arxiv-refined-by-data-juicer) | Redpajama | +| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-book-refined-by-data-juicer) | Redpajama | +| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-wiki-refined-by-data-juicer) | Redpajama | +| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-c4-refined-by-data-juicer) | Redpajama | +| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2019-30-refined-by-data-juicer) | Redpajama | +| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2020-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2021-04-refined-by-data-juicer) | Redpajama | +| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2022-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2023-06-refined-by-data-juicer) | Redpajama | +| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-stack-code-refined-by-data-juicer) | Redpajama
The Stack | +| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-pile-stackexchange-refined-by-data-juicer) | Redpajama
The Pile | +| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-europarl-refined-by-data-juicer) | The Pile | +| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-freelaw-refined-by-data-juicer) | The Pile | +| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-hackernews-refined-by-data-juicer) | The Pile | +| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-nih-refined-by-data-juicer) | The Pile | +| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-philpaper-refined-by-data-juicer) | The Pile | +| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer) | The Pile | +| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-central-refined-by-data-juicer) | The Pile | +| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-uspto-refined-by-data-juicer) | The Pile | ## Before and after refining for Alpaca-CoT Dataset | subset | #samples before | #samples after | keep ratio | config link | data link | source | |------------------|:-------------------------:|:--------------------------------------:|:----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | -| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-en-refined-by-data-juicer) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-zh-refined-by-data-juicer) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | diff --git a/configs/data_juicer_recipes/README_ZH.md b/configs/data_juicer_recipes/README_ZH.md index 12a5d6e31..af8d1d697 100644 --- a/configs/data_juicer_recipes/README_ZH.md +++ b/configs/data_juicer_recipes/README_ZH.md @@ -8,30 +8,30 @@ | 数据子集 | 完善前的样本数目 | 完善后的样本数目 | 样本保留率 | 配置链接 | 数据链接 | 来源 | |----------------------|:---------------------------:|:--------------:|:---------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------| -| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary) | Redpajama | -| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary) | Redpajama | -| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary) | Redpajama | -| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary) | Redpajama | -| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary) | Redpajama
The Stack | -| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary) | Redpajama
The Pile | -| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary) | The Pile | -| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary) | The Pile | -| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary) | The Pile | -| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary) | The Pile | -| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary) | The Pile | -| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary) | The Pile | -| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary) | The Pile | -| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary) | The Pile | +| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-arxiv-refined-by-data-juicer) | Redpajama | +| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-book-refined-by-data-juicer) | Redpajama | +| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-wiki-refined-by-data-juicer) | Redpajama | +| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-c4-refined-by-data-juicer) | Redpajama | +| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2019-30-refined-by-data-juicer) | Redpajama | +| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2020-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2021-04-refined-by-data-juicer) | Redpajama | +| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2022-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2023-06-refined-by-data-juicer) | Redpajama | +| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-stack-code-refined-by-data-juicer) | Redpajama
The Stack | +| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-pile-stackexchange-refined-by-data-juicer) | Redpajama
The Pile | +| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-europarl-refined-by-data-juicer) | The Pile | +| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-freelaw-refined-by-data-juicer) | The Pile | +| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-hackernews-refined-by-data-juicer) | The Pile | +| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-nih-refined-by-data-juicer) | The Pile | +| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-philpaper-refined-by-data-juicer) | The Pile | +| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer) | The Pile | +| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-central-refined-by-data-juicer) | The Pile | +| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-uspto-refined-by-data-juicer) | The Pile | ## 完善前后的Alpaca-CoT数据集 | 数据子集 | 完善前的样本数目 | 完善后的样本数目 | 样本保留率 | 配置链接 | 数据链接 | 来源 | |-------------------|:------------------------:|:----------------------------------:|:---------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------| -| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary) | [来自Alpaca-CoT的39个子集](alpaca_cot/README_ZH.md#完善的-alpaca-cot-数据集元信息) | -| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary) | [来自Alpaca-CoT的28个子集](alpaca_cot/README_ZH.md#完善的-alpaca-cot-数据集元信息) | +| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-en-refined-by-data-juicer) | [来自Alpaca-CoT的39个子集](alpaca_cot/README_ZH.md#完善的-alpaca-cot-数据集元信息) | +| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-zh-refined-by-data-juicer) | [来自Alpaca-CoT的28个子集](alpaca_cot/README_ZH.md#完善的-alpaca-cot-数据集元信息) | diff --git a/demos/auto_evaluation_helm/README_ZH.md b/demos/auto_evaluation_helm/README_ZH.md new file mode 100644 index 000000000..b6b50d24e --- /dev/null +++ b/demos/auto_evaluation_helm/README_ZH.md @@ -0,0 +1,109 @@ +## 自动化评测:HELM 评测及可视化 + +### 什么是自动化评测 + +这里的自动化评测是指对模型训练过程中得到的检查点自动使用评测数据集执行评测并记录评测结果。 + +### 为什么要自动化评测 + +在大模型训练过程中仅通过训练 loss 难以准确评估模型的实际性能,需要使用多种评测数据集从各维度评价模型的能力,实时持续监控各项指标随训练迭代的变化情况,并与其他基线模型做比较,从而判断模型是否还有继续训练的价值,节省训练开销。 + +但上述的评测流程重复繁琐且易于出错(例如:检查是否有新检查点、运行评测、记录评测结果、结果可视化等),而本自动化评测工具则能够提供一键式的解决方案,节省大量人力成本。 + +将自动化评测与数据预处理结合可及时通过评测结果判断数据预处理阶段配置的合理性,形成反馈循环,更快地找出更合理的数据预处理方法。 + +### 如何使用自动化评测:以 HELM 和 Megatron-LM 为例 + +> - HELM 是 Stanford 开源的一套评测框架,包含了丰富的测试数据集以及多种评测指标,现已评测了超过 50 种可公开访问的大模型 +> - Megatron-LM 是 Nvidia 开源的 Transformer 训练框架,支持大规模分布式训练且性能极高,是多个知名大模型训练框架 (GPT-Neox, Megatron-Deepspeed等) 的基础 + +本节介绍如何使用本工具中的 HELM 评测框架自动化评测 Megatron-LM 训练得到的 GPT2 模型,运行该样例需要至少一张 V100 或其他更高规格的显卡,该样例在计算资源允许的情况下可以扩展支持更大的模型。 + +#### 1. 准备环境 + +由于 HELM 和 Megatron-LM 的依赖项繁杂,为了减少安装过程中遇到的依赖问题,推荐基于 NGC 的 Pytorch 容器 (`nvcr.io/nvidia/pytorch:22.12-py3`) 构建环境。 + +假设您的数据集 jsonl 文件路径为 `/dataset/dataset.jsonl`,Data-Juicer 的代码路径为 `/code/data-juicer`,只需执行如下指令: + +```shell +docker pull nvcr.io/nvidia/pytorch:22.12-py3 +docker run --gpus all --ipc=host --ulimit memlock=-1 -it --rm -v /dataset:/workspace/data -v /code/data-juicer:/workspace/data-juicer nvcr.io/nvidia/pytorch:22.12-py3 +``` +docker 容器成功运行后在容器内运行安装脚本并登录 wandb: + +```shell +cd /workspace/data-juicer/thirdparty +./setup_megatron.sh +./setup_helm.sh +wandb login +``` + +安装完成后在容器外运行如下指令将容器保存下来方便后续使用,其中的 `container_id` 可通过 `docker ps` 获取 + +```shell +docker commit <container_id> data-juicer-eval +``` + +#### 2. 
将数据集预处理为 Megatron-LM 可识别的格式 + +进入 Megatron-LM 目录并执行数据预处理脚本,该脚本会将 data-juicer 处理好的 jsonline(假设路径为 `/workspace/data/dataset.jsonl`)文件转化为二进制格式,并保存为 `/workspace/data/dataset_text_document.bin` 和 `/workspace/data/dataset_text_document.idx` 两个文件。 + +```shell +cd /workspace/data-juicer/thirdparty/Megatron-LM +python tools/preprocess_data.py \ + --input /workspace/data/dataset.jsonl \ + --output-prefix dataset \ + --vocab /workspace/data-juicer/demos/gpt2-vocab.json \ + --dataset-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --split-sentences +``` + + +#### 3. 启动 Megatron-LM 训练 + +进入 Megatron-LM 目录并执行如下指令 + +```shell +cd /workspace/data-juicer/thirdparty/Megatron-LM +nohup bash /workspace/data-juicer/demos/auto_eval_helm/pretrain_example.sh > train.log 2>&1 & +``` + +> `pretrain_example.sh` 会执行 GPT2 模型的训练,从 `/workspace/data` 路径获取 `.bin` 和 `.idx` 二进制数据集文件,训练 200000 个 iteration,并每隔 2000 个 iteration 将模型以检查点形式保存在 `/workspace/data/checkpoints/GPT2` 目录下。 +> 可通过修改 `pretrain_example.sh` 来调整模型的规模、数据集路径、检查点路径等参数,更详细的配置信息请参考 [Megatron-LM 官方仓库](https://github.com/NVIDIA/Megatron-LM)。 + +#### 4. 启动自动化评测 + +进入 data-juicer 的自动评测工具库目录并执行如下指令: + +```shell +cd /workspace/data-juicer/tools/eval +python evaluator.py --config /workspace/data-juicer/demos/evalutor.yaml --begin-iteration 2000 --end-iteration 200000 --iteration-interval 2000 --check-interval 30 +``` + +该脚本会每隔 30 分钟检测一次 `/workspace/data/checkpoints/GPT2` 目录,并从 2000 iteration 开始每隔 2000 iteration 对检查点执行一次 HELM 评测并将评测结果记录至 wandb,直到评测完 200000 iteration 对应的检查点为止,您可以在 wandb 上查看已完成的评测结果,下图展示了模型训练到 140000 iteration 时 wandb 上的可视化展示结果。 + +![训练过程中的评测结果展示](imgs/eval-02.png) + +> 本示例运行的 HELM 测试集配置位于 `/workspace/data-juicer/demos/helm_spec_template.conf` 中,这里仅选用了 MMLU 的一个子集、boolq、narrative_qa 以及 hellaswag 作为样例,完整的测试集配置位于 `/workspace/data-juicer/tools/eval/config/helm_spec_template.conf` 中。 + + +#### 5. 
汇总评测结果 + +为了体现模型训练效果,可以借助 `wandb_writer.py` 将多个模型的评测结果汇总到同一个排行榜上进行比较。 + +首先,将基线模型的各项评测结果直接记录到 wandb: + +```shell +cd /workspace/data-juicer/tools/eval/recorder +python wandb_writer.py --config /workspace/data-juicer/demos/baselines.yaml +``` + +在确保所有参与排行榜的模型的评测结果都已经记录到 wandb 之后,使用 `leaderboard.yaml` 构建排行榜: + +```shell +cd /workspace/data-juicer/tools/eval/recorder +python wandb_writer.py --config /workspace/data-juicer/demos/leaderboard.yaml +``` + +![排行榜](imgs/eval-01.png) diff --git a/demos/auto_evaluation_helm/app.py b/demos/auto_evaluation_helm/app.py new file mode 100644 index 000000000..a9f7edbd2 --- /dev/null +++ b/demos/auto_evaluation_helm/app.py @@ -0,0 +1,54 @@ +import os +import re +import streamlit as st + +class Visualize: + + @staticmethod + def setup(): + st.set_page_config( + page_title='Data-Juicer', + page_icon=':smile', + layout='wide', + # initial_sidebar_state="expanded", + ) + + readme_link = 'https://github.com/alibaba/data-juicer' + st.markdown( + '#
Data-Juicer
', + unsafe_allow_html=True, + ) + st.markdown( + f'
A One-Stop Data Processing System for \ + Large Language Models, \ + see more details in our Github
', + unsafe_allow_html=True, + ) + + @staticmethod + def visualize(): + Visualize.setup() + +def main(): + + def make_image(line): + pattern = r'!\[(.*?)\]\((.*?)\)' + maches = re.findall(pattern, line) + st.image(maches[0][1], output_format='png', use_column_width=True) + + Visualize.visualize() + buffer = [] + with open("README_ZH.md", 'r', encoding='utf-8') as f: + lines = f.readlines() + for line in lines: + if "imgs/" in line: + st.markdown('\n'.join(buffer)) + make_image(line) + buffer.clear() + else: + buffer.append(line) + st.markdown('\n'.join(buffer)) + # hello() + +if __name__ == '__main__': + main() diff --git a/demos/auto_evaluation_helm/imgs/data-juicer.png b/demos/auto_evaluation_helm/imgs/data-juicer.png new file mode 100644 index 000000000..ffa59a3db Binary files /dev/null and b/demos/auto_evaluation_helm/imgs/data-juicer.png differ diff --git a/demos/auto_evaluation_helm/imgs/eval-01.png b/demos/auto_evaluation_helm/imgs/eval-01.png new file mode 100644 index 000000000..382bf2743 Binary files /dev/null and b/demos/auto_evaluation_helm/imgs/eval-01.png differ diff --git a/demos/auto_evaluation_helm/imgs/eval-02.png b/demos/auto_evaluation_helm/imgs/eval-02.png new file mode 100644 index 000000000..71ca49bb2 Binary files /dev/null and b/demos/auto_evaluation_helm/imgs/eval-02.png differ diff --git a/demos/data_mixture/app.py b/demos/data_mixture/app.py index 6c6a2773b..ec649cdb7 100644 --- a/demos/data_mixture/app.py +++ b/demos/data_mixture/app.py @@ -44,7 +44,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/data_process_hpo/app.py b/demos/data_process_hpo/app.py new file mode 100644 index 000000000..bc7a78baf --- /dev/null +++ b/demos/data_process_hpo/app.py @@ -0,0 +1,46 @@ +import os +import streamlit as st + +class Visualize: + + @staticmethod + def setup(): + st.set_page_config( + page_title='Data-Juicer', + page_icon=':smile', + #layout='wide', + # initial_sidebar_state="expanded", + ) + + readme_link = 'https://github.com/alibaba/data-juicer' + st.markdown( + '
Data-Juicer \ +
', + unsafe_allow_html=True, + ) + st.markdown( + f'
A One-Stop Data Processing System for \ + Large Language Models, \ + see more details in our Github
', + unsafe_allow_html=True, + ) + + @staticmethod + def visualize(): + Visualize.setup() + +def main(): + def hello(): + + st.image('imgs/data-juicer.png', output_format='png', use_column_width = True) + demo = 'The demo is coming soon😊' + st.markdown( + f'
{demo} \ +
', + unsafe_allow_html=True, + ) + Visualize.visualize() + hello() + +if __name__ == '__main__': + main() diff --git a/demos/data_process_hpo/imgs/data-juicer.png b/demos/data_process_hpo/imgs/data-juicer.png new file mode 100644 index 000000000..ffa59a3db Binary files /dev/null and b/demos/data_process_hpo/imgs/data-juicer.png differ diff --git a/demos/data_process_loop/app.py b/demos/data_process_loop/app.py index 901efd9b9..5a2bafd3b 100644 --- a/demos/data_process_loop/app.py +++ b/demos/data_process_loop/app.py @@ -178,7 +178,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/data_visualization_diversity/app.py b/demos/data_visualization_diversity/app.py index 2d95bd65e..069f3c91c 100644 --- a/demos/data_visualization_diversity/app.py +++ b/demos/data_visualization_diversity/app.py @@ -133,7 +133,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/data_visualization_op_effect/app.py b/demos/data_visualization_op_effect/app.py index c5a944cf5..6f4600d0d 100644 --- a/demos/data_visualization_op_effect/app.py +++ b/demos/data_visualization_op_effect/app.py @@ -177,7 +177,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/data_visualization_statistics/app.py b/demos/data_visualization_statistics/app.py index e4f14f1bc..eecce2856 100644 --- a/demos/data_visualization_statistics/app.py +++ b/demos/data_visualization_statistics/app.py @@ -130,7 +130,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/overview_scan/app.py b/demos/overview_scan/app.py index d3fbbc000..7ac664eb7 100644 --- a/demos/overview_scan/app.py +++ b/demos/overview_scan/app.py @@ -85,13 +85,13 @@ op_desc = ''' The operators in Data-Juicer are categorized into 5 types. -| Type | Number | Description | -|-----------------------------------|:------:|-------------| -| Formatter | 7 | Discovers, loads, and cannibalizes source data | -| Mapper | 17 | Edits and transforms samples | -| Filter | 15 | Filters out low-quality samples | -| Deduplicator | 3 | Detects and removes duplicate samples | -| Selector | 2 | Selects top samples based on ranking | +| Type | Number | Description | +|-----------------------------------|:------:|-------------------------------------------------| +| Formatter | 7 | Discovers, loads, and canonicalizes source data | +| Mapper | 19 | Edits and transforms samples | +| Filter | 16 | Filters out low-quality samples | +| Deduplicator | 3 | Detects and removes duplicate samples | +| Selector | 2 | Selects top samples based on ranking | ''' op_list_desc = { @@ -111,42 +111,46 @@ ''' | Operator | Domain | Lang | Description | |-----------------------------------------------|--------------------|--------|----------------------------------------------------------------------------------------------------------------| -| remove_header_mapper | LaTeX | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names | -| remove_bibliography_mapper | LaTeX | en, zh | Removes the bibliography of TeX documents | -| expand_macro_mapper | LaTeX | en, zh | Expands macros usually defined at the top of TeX documents | -| whitespace_normalization_mapper | General | en, zh | Normalizes various Unicode whitespaces to the normal ASCII space (U+0020) | -| punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents | 
-| fix_unicode_mapper | General | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/)) | -| sentence_split_mapper | General | en | Splits and reorganizes sentences according to semantics | -| remove_long_words_mapper | General | en, zh | Removes words with length outside the specified range | +| clean_copyright_mapper | Code | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*) | +| clean_email_mapper | General | en, zh | Removes email information | +| clean_html_mapper | General | en, zh | Removes HTML tags and returns plain text of all the nodes | +| clean_ip_mapper | General | en, zh | Removes IP addresses | +| clean_links_mapper | General, Code | en, zh | Removes links, such as those starting with http or ftp | +| expand_macro_mapper | LaTeX | en, zh | Expands macros usually defined at the top of TeX documents | +| fix_unicode_mapper | General | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/)) | +| nlpaug_en_mapper | General | en | Simply augment texts in English based on the `nlpaug` library | +| nlpcda_zh_mapper | General | zh | Simply augment texts in Chinese based on the `nlpcda` library | +| punctuation_normalization_mapper | General | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents | +| remove_bibliography_mapper | LaTeX | en, zh | Removes the bibliography of TeX documents | +| remove_comments_mapper | LaTeX | en, zh | Removes the comments of TeX documents | +| remove_header_mapper | LaTeX | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names | +| remove_long_words_mapper | General | en, zh | Removes words with length outside the specified range | +| remove_specific_chars_mapper | General | en, zh | Removes any user-specified characters or substrings | +| remove_table_text_mapper | General, Financial | en | Detects and removes possible table contents (:warning: relies on 
regular expression matching and thus fragile) | | remove_words_with_incorrect_
substrings_mapper | General | en, zh | Removes words containing specified substrings | -| clean_email_mapper | General | en, zh | Removes email information | -| clean_ip_mapper | General | en, zh | Removes IP addresses | -| clean_links_mapper | General, Code | en, zh | Removes links, such as those starting with http or ftp | -| clean_html_mapper | General | en, zh | Removes HTML tags and returns plain text of all the nodes | -| remove_table_text_mapper | General, Financial | en | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) | -| clean_copyright_mapper | Code | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*) | -| remove_specific_chars_mapper | General | en, zh | Removes any user-specified characters or substrings | +| sentence_split_mapper | General | en | Splits and reorganizes sentences according to semantics | +| whitespace_normalization_mapper | General | en, zh | Normalizes various Unicode whitespaces to the normal ASCII space (U+0020) | ''', 'filter': ''' | Operator | Domain | Lang | Description | |--------------------------------|---------|--------|--------------------------------------------------------------------------------------------| -| word_num_filter | General | en, zh | Keeps samples with word count within the specified range | -| stopwords_filter | General | en, zh | Keeps samples with stopword ratio above the specified threshold | -| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold | +| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range | +| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range | | character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range | -| 
word_repetition_filter | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range | -| special_characters_filter | General | en, zh | Keeps samples with special-char ratio within the specified range | +| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold | | language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score | -| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold | | maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range | -| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range | -| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range | -| text_length_filter | General | en, zh | Keeps samples with total text length within the specified range | -| suffix_filter | General | en, zh | Keeps samples with specified suffixes | +| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold | +| special_characters_filter | General | en, zh | Keeps samples with special-char ratio within the specified range | | specified_field_filter | General | en, zh | Filters samples based on field, with value lies in the specified targets | | specified_numeric_field_filter | General | en, zh | Filters samples based on field, with value lies in the specified range (for numeric types) | +| stopwords_filter | General | en, zh | Keeps samples with stopword ratio above the specified threshold | +| suffix_filter | General | en, zh | Keeps samples with specified suffixes | +| text_length_filter | General | en, zh | Keeps samples with total text length within the specified range | +| token_num_filter | General | en, zh | Keeps samples with token count within the 
specified range | +| word_num_filter | General | en, zh | Keeps samples with word count within the specified range | +| word_repetition_filter | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range | ''', 'deduplicator': ''' @@ -166,21 +170,22 @@ } demo_desc = ''' +- Introduction to Data-Juicer [[ModelScope](https://modelscope.cn/studios/Data-Juicer/overview_scan/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/overview_scan)] - Data Visualization: - - Basic Statistics [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] - - Lexical Diversity [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] - - Operator Effect [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] + - Basic Statistics [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_statistics/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_statistics)] + - Lexical Diversity [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_diversity/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_diversity)] + - Operator Effect [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_visulization_op_effect/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_visualization_op_effect)] - Data Processing: - - Scientific Literature (e.g. [arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] - - Programming Code (e.g. [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] - - Chinese Instruction Data (e.g. 
[Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] + - Scientific Literature (e.g. [arXiv](https://info.arxiv.org/help/bulk_data_s3.html)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sci_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_sci_data)] + - Programming Code (e.g. [TheStack](https://huggingface.co/datasets/bigcode/the-stack)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_code_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_code_data)] + - Chinese Instruction Data (e.g. [Alpaca-CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/process_sft_zh_data/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/process_cft_zh_data)] - Tool Pool: - - Dataset Splitting by Language [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] - - Quality Classifier for CommonCrawl [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] - - Auto Evaluation on [HELM](https://github.com/stanford-crfm/helm) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] - - Data Sampling and Mixture [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] -- Data Process Loop [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] -- Data Process HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] + - Dataset Splitting by Language [[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_dataset_splitting_by_language/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_dataset_splitting_by_language)] + - Quality Classifier for CommonCrawl 
[[ModelScope](https://modelscope.cn/studios/Data-Juicer/tool_quality_classifier/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/tool_quality_classifier)] + - Auto Evaluation on [HELM](https://github.com/stanford-crfm/helm) [[ModelScope](https://modelscope.cn/studios/Data-Juicer/auto_evaluation_helm/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/auto_evaluation_helm)] + - Data Sampling and Mixture [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_mixture/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_mixture)] +- Data Processing Loop [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_loop/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_loop)] +- Data Processing HPO [[ModelScope](https://modelscope.cn/studios/Data-Juicer/data_process_hpo/summary)] [[HuggingFace](https://huggingface.co/spaces/datajuicer/data_process_hpo)] ''' diff --git a/demos/process_cft_zh_data/app.py b/demos/process_cft_zh_data/app.py index 85c36ec43..d4d1aae0c 100644 --- a/demos/process_cft_zh_data/app.py +++ b/demos/process_cft_zh_data/app.py @@ -8,6 +8,7 @@ from data_juicer.config import init_configs from data_juicer.core import Analyser, Executor +from data_juicer.utils.constant import HashKeys demo_path = os.path.dirname(os.path.abspath(__file__)) project_path = os.path.dirname(os.path.dirname(demo_path)) @@ -39,33 +40,33 @@ | subset | #samples before | #samples after | keep ratio |data link | source | |----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| arXiv | 1,724,497 | 1,655,259 | 95.99% | 
[Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary) | Redpajama | -| Books | 205,182 | 195,983 | 95.51% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary) | Redpajama | -| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary) | Redpajama | -| C4 | 364,868,892 | 344,491,171 | 94.42% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary) | Redpajama | -| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary) | Redpajama
The Stack | -| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary) | Redpajama
The Pile | -| EuroParl | 69,814 | 61,601 | 88.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary) | The Pile | -| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary) | The Pile | -| HackerNews | 373,027 | 371,331 | 99.55% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary) | The Pile | -| NIH ExPorter | 939,661 | 858,492 | 91.36% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary) | The Pile | -| PhilPapers | 32,782 | 29,117 | 88.82% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary) | The Pile | -| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary) | The Pile | -| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary) | The Pile | -| USPTO | 5,883,024 | 4,516,283 | 76.77% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary) | The Pile | +| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-arxiv-refined-by-data-juicer) | Redpajama | +| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-book-refined-by-data-juicer) | Redpajama | +| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-wiki-refined-by-data-juicer) | Redpajama | +| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-c4-refined-by-data-juicer) | Redpajama | +| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2019-30-refined-by-data-juicer) | Redpajama | +| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2020-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2021-04-refined-by-data-juicer) | Redpajama | +| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2022-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2023-06-refined-by-data-juicer) | Redpajama | +| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-stack-code-refined-by-data-juicer) | Redpajama
The Stack | +| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-pile-stackexchange-refined-by-data-juicer) | Redpajama
The Pile | +| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-europarl-refined-by-data-juicer) | The Pile | +| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-freelaw-refined-by-data-juicer) | The Pile | +| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-hackernews-refined-by-data-juicer) | The Pile | +| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-nih-refined-by-data-juicer) | The Pile | +| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-philpaper-refined-by-data-juicer) | The Pile | +| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer) | The Pile | +| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-central-refined-by-data-juicer) | The Pile | +| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-uspto-refined-by-data-juicer) | The Pile | ## Before and after refining for Alpaca-CoT Dataset -| subset | #samples before | #samples after | keep ratio |data link | source | -|----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | -| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| subset | #samples before | #samples after | keep ratio | config link | data link | source | +|------------------|:-------------------------:|:--------------------------------------:|:----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-en-refined-by-data-juicer) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-zh-refined-by-data-juicer) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | ''' @@ -130,7 +131,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) @@ -200,8 +201,8 @@ def display_tracer_result(op_type, prefix, files): # simhash value may exceed the range of # integer type of streamlit if 'simhash_deduplicator' in filename: - obj['dup1'].pop('simhash') - obj['dup2'].pop('simhash') + obj['dup1'].pop(HashKeys.simhash) + obj['dup2'].pop(HashKeys.simhash) st.dataframe(objs) diff --git a/demos/process_code_data/app.py b/demos/process_code_data/app.py index ddfaedc08..26b8a6606 100644 --- a/demos/process_code_data/app.py +++ b/demos/process_code_data/app.py @@ -8,6 +8,7 @@ from data_juicer.config import init_configs from data_juicer.core import Analyser, Executor +from data_juicer.utils.constant import HashKeys demo_path = os.path.dirname(os.path.abspath(__file__)) project_path = os.path.dirname(os.path.dirname(demo_path)) @@ -39,33 +40,33 @@ | subset | #samples before | #samples after | keep ratio |data link | source | |----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| arXiv | 1,724,497 | 1,655,259 | 95.99% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary) | Redpajama | -| Books | 205,182 | 195,983 | 95.51% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary) | Redpajama | -| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary) | Redpajama | -| C4 | 364,868,892 | 344,491,171 | 94.42% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary) | Redpajama | -| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary) | Redpajama
The Stack | -| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary) | Redpajama
The Pile | -| EuroParl | 69,814 | 61,601 | 88.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary) | The Pile | -| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary) | The Pile | -| HackerNews | 373,027 | 371,331 | 99.55% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary) | The Pile | -| NIH ExPorter | 939,661 | 858,492 | 91.36% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary) | The Pile | -| PhilPapers | 32,782 | 29,117 | 88.82% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary) | The Pile | -| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary) | The Pile | -| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary) | The Pile | -| USPTO | 5,883,024 | 4,516,283 | 76.77% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary) | The Pile | +| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-arxiv-refined-by-data-juicer) | Redpajama | +| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-book-refined-by-data-juicer) | Redpajama | +| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-wiki-refined-by-data-juicer) | Redpajama | +| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-c4-refined-by-data-juicer) | Redpajama | +| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2019-30-refined-by-data-juicer) | Redpajama | +| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2020-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2021-04-refined-by-data-juicer) | Redpajama | +| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2022-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2023-06-refined-by-data-juicer) | Redpajama | +| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-stack-code-refined-by-data-juicer) | Redpajama
The Stack | +| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-pile-stackexchange-refined-by-data-juicer) | Redpajama
The Pile | +| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-europarl-refined-by-data-juicer) | The Pile | +| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-freelaw-refined-by-data-juicer) | The Pile | +| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-hackernews-refined-by-data-juicer) | The Pile | +| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-nih-refined-by-data-juicer) | The Pile | +| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-philpaper-refined-by-data-juicer) | The Pile | +| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer) | The Pile | +| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-central-refined-by-data-juicer) | The Pile | +| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-uspto-refined-by-data-juicer) | The Pile | ## Before and after refining for Alpaca-CoT Dataset -| subset | #samples before | #samples after | keep ratio |data link | source | -|----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | -| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| subset | #samples before | #samples after | keep ratio | config link | data link | source | +|------------------|:-------------------------:|:--------------------------------------:|:----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-en-refined-by-data-juicer) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-zh-refined-by-data-juicer) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | ''' @@ -130,7 +131,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) @@ -199,8 +200,8 @@ def display_tracer_result(op_type, prefix, files): # simhash value may exceed the range of # integer type of streamlit if 'simhash_deduplicator' in filename: - obj['dup1'].pop('simhash') - obj['dup2'].pop('simhash') + obj['dup1'].pop(HashKeys.simhash) + obj['dup2'].pop(HashKeys.simhash) st.dataframe(objs) diff --git a/demos/process_sci_data/app.py b/demos/process_sci_data/app.py index c98141851..3bef7de88 100644 --- a/demos/process_sci_data/app.py +++ b/demos/process_sci_data/app.py @@ -8,6 +8,7 @@ from data_juicer.config import init_configs from data_juicer.core import Analyser, Executor +from data_juicer.utils.constant import HashKeys demo_path = os.path.dirname(os.path.abspath(__file__)) project_path = os.path.dirname(os.path.dirname(demo_path)) @@ -39,33 +40,33 @@ | subset | #samples before | #samples after | keep ratio |data link | source | |----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| arXiv | 1,724,497 | 1,655,259 | 95.99% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary) | Redpajama | -| Books | 205,182 | 195,983 | 95.51% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary) | Redpajama | -| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary) | Redpajama | -| C4 | 364,868,892 | 344,491,171 | 94.42% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary) | Redpajama | -| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary) | Redpajama | -| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary) | Redpajama
The Stack | -| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary) | Redpajama
The Pile | -| EuroParl | 69,814 | 61,601 | 88.23% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary) | The Pile | -| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary) | The Pile | -| HackerNews | 373,027 | 371,331 | 99.55% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary) | The Pile | -| NIH ExPorter | 939,661 | 858,492 | 91.36% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary) | The Pile | -| PhilPapers | 32,782 | 29,117 | 88.82% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary) | The Pile | -| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary) | The Pile | -| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary) | The Pile | -| USPTO | 5,883,024 | 4,516,283 | 76.77% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary) | The Pile | +| arXiv | 1,724,497 | 1,655,259 | 95.99% | [redpajama-arxiv-refine.yaml](redpajama-arxiv-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-arxiv-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-arxiv-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-arxiv-refined-by-data-juicer) | Redpajama | +| Books | 205,182 | 195,983 | 95.51% | [redpajama-book-refine.yaml](redpajama-book-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-book-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-book-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-book-refined-by-data-juicer) | Redpajama | +| Wikipedia | 29,834,171 | 26,990,659 | 90.47% | [redpajama-wiki-refine.yaml](redpajama-wiki-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-wiki-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-wiki-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-wiki-refined-by-data-juicer) | Redpajama | +| C4 | 364,868,892 | 344,491,171 | 94.42% | [redpajama-c4-refine.yaml](redpajama-c4-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-c4-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-c4-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-c4-refined-by-data-juicer) | Redpajama | +| Common Crawl 2019-30 | 81,085,420 | 36,557,283 | 45.08% | [redpajama-cc-2019-30-refine.yaml](redpajama-cc-2019-30-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2019-30-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2019-30-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2019-30-refined-by-data-juicer) | Redpajama | +| Common Crawl 2020-05 | 90,850,492 | 42,612,596 | 46.90% | [redpajama-cc-2020-05-refine.yaml](redpajama-cc-2020-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2020-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2020-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2020-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2021-04 | 98,878,523 | 44,724,752 | 45.23% | [redpajama-cc-2021-04-refine.yaml](redpajama-cc-2021-04-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2021-04-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2021-04-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2021-04-refined-by-data-juicer) | Redpajama | +| Common Crawl 2022-05 | 94,058,868 | 42,648,496 | 45.34% | [redpajama-cc-2022-05-refine.yaml](redpajama-cc-2022-05-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2022-05-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2022-05-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2022-05-refined-by-data-juicer) | Redpajama | +| Common Crawl 2023-06 | 111,402,716 | 50,643,699 | 45.46% | [redpajama-cc-2023-06-refine.yaml](redpajama-cc-2023-06-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-cc-refine-results/redpajama-cc-2023-06-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-cc-2023-06-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-cc-2023-06-refined-by-data-juicer) | Redpajama | +| Github Code | 73,208,524
+ 21,387,703 | 49,279,344 | 52.09% | [redpajama-code-refine.yaml](github_code/redpajama-code-refine.yaml)
[stack-code-refine.yaml](github_code/stack-code-refine.yaml)
[redpajama-stack-code-deduplicate.yaml](github_code/redpajama-stack-code-deduplicate.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-stack-code-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-stack-code-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-stack-code-refined-by-data-juicer) | Redpajama
The Stack | +| StackExchange | 45,447,328 | 26,309,203 | 57.89% | [redpajama-pile-stackexchange-refine.yaml](redpajama-pile-stackexchange-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/redpajama-pile-stackexchange-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/redpajama-pile-stackexchange-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/redpajama-pile-stackexchange-refined-by-data-juicer) | Redpajama
The Pile | +| EuroParl | 69,814 | 61,601 | 88.23% | [pile-europarl-refine.yaml](pile-europarl-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-europarl-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-europarl-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-europarl-refined-by-data-juicer) | The Pile | +| FreeLaw | 3,562,015 | 2,942,612 | 82.61% | [pile-freelaw-refine.yaml](pile-freelaw-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-freelaw-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-freelaw-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-freelaw-refined-by-data-juicer) | The Pile | +| HackerNews | 373,027 | 371,331 | 99.55% | [pile-hackernews-refine.yaml](pile-hackernews-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hackernews-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-hackernews-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-hackernews-refined-by-data-juicer) | The Pile | +| NIH ExPorter | 939,661 | 858,492 | 91.36% | [pile-nih-refine.yaml](pile-nih-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-hin-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-nih-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-nih-refined-by-data-juicer) | The Pile | +| PhilPapers | 32,782 | 29,117 | 88.82% | [pile-philpaper-refine.yaml](pile-philpaper-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-philpaper-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-philpaper-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-philpaper-refined-by-data-juicer) | The Pile | +| PubMed Abstracts | 15,518,009 | 15,009,325 | 96.72% | [pile-pubmed-abstract-refine.yaml](pile-pubmed-abstract-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-abstract-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-abstracts-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer) | The Pile | +| PubMed Central | 3,098,930 | 2,694,860 | 86.96% | [pile-pubmed-central-refine.yaml](pile-pubmed-central-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-pubmed-central-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-pubmed-central-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-pubmed-central-refined-by-data-juicer) | The Pile | +| USPTO | 5,883,024 | 4,516,283 | 76.77% | [pile-uspto-refine.yaml](pile-uspto-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/pretraining/the-pile-uspto-refine-result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/the-pile-uspto-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/the-pile-uspto-refined-by-data-juicer) | The Pile | ## Before and after refining for Alpaca-CoT Dataset -| subset | #samples before | #samples after | keep ratio |data link | source | -|----------------------|:---------------------------:|:--------------:|:----------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | -| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| subset | #samples before | #samples after | keep ratio | config link | data link | source | +|------------------|:-------------------------:|:--------------------------------------:|:----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| Alpaca-Cot EN | 136,219,879 | 72,855,345 | 54.48% | [alpaca-cot-en-refine.yaml](alpaca_cot/alpaca-cot-en-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-en-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-en-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-en-refined-by-data-juicer) | [39 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | +| Alpaca-Cot ZH | 21,197,246 | 9,873,214 | 46.58% | [alpaca-cot-zh-refine.yaml](alpaca_cot/alpaca-cot-zh-refine.yaml) | [Aliyun](https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/LLM_data/our_refined_datasets/CFT/alpaca-cot-zh-refine_result.jsonl)
[ModelScope](https://modelscope.cn/datasets/Data-Juicer/alpaca-cot-zh-refined-by-data-juicer/summary)
[HuggingFace](https://huggingface.co/datasets/datajuicer/alpaca-cot-zh-refined-by-data-juicer) | [28 Subsets of Alpaca-CoT](alpaca_cot/README.md#refined-alpaca-cot-dataset-meta-info) | ''' @@ -125,7 +126,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) @@ -190,8 +191,8 @@ def display_tracer_result(op_type, prefix, files): # simhash value may exceed the range of # integer type of streamlit if 'simhash_deduplicator' in filename: - obj['dup1'].pop('simhash') - obj['dup2'].pop('simhash') + obj['dup1'].pop(HashKeys.simhash) + obj['dup2'].pop(HashKeys.simhash) st.dataframe(objs) diff --git a/demos/tool_dataset_splitting_by_language/app.py b/demos/tool_dataset_splitting_by_language/app.py index 0830bfdb3..9907d19cd 100644 --- a/demos/tool_dataset_splitting_by_language/app.py +++ b/demos/tool_dataset_splitting_by_language/app.py @@ -59,7 +59,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/demos/tool_quality_classifier/app.py b/demos/tool_quality_classifier/app.py index 03f7be45d..b118659da 100644 --- a/demos/tool_quality_classifier/app.py +++ b/demos/tool_quality_classifier/app.py @@ -112,7 +112,7 @@ def setup(): st.markdown( f'
A One-Stop Data Processing System for \ Large Language Models, \ - see more details in our page
', + see more details in our Github', unsafe_allow_html=True, ) diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 723494975..a151ff3ba 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -1,3 +1,4 @@ +fsspec==2023.3.0 pandas==2.0.0 datasets==2.11.0 loguru diff --git a/environments/preprocess_requires.txt b/environments/preprocess_requires.txt index 870fc86b3..657e1936b 100644 --- a/environments/preprocess_requires.txt +++ b/environments/preprocess_requires.txt @@ -1,2 +1,2 @@ fire -jsonlines \ No newline at end of file +jsonlines