Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
chenyushuo committed Dec 20, 2024
2 parents 9d893be + 2fdf484 commit 024b338
Show file tree
Hide file tree
Showing 130 changed files with 5,616 additions and 1,001 deletions.
3 changes: 3 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ omit =

# avoid measuring code of unittest
tests/*

[report]
ignore_errors = True
5 changes: 4 additions & 1 deletion .github/workflows/deploy_sphinx_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ on:
jobs:
pages:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [ "3.9", "3.10" ]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@master
with:
python_version: ${{ matrix.python-version }}
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
56 changes: 56 additions & 0 deletions .github/workflows/perf-bench.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# This workflow runs the data-juicer performance benchmark inside a docker
# compose stack on a self-hosted GPU runner, reporting results to W&B.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: performance_benchmark

# Triggered manually (workflow_dispatch) or on every push to main.
on:
workflow_dispatch:
push:
branches:
- main

# Read-only token scope: the job only checks out code, it never pushes.
permissions:
contents: read

env:
# Allows actions built on older Node versions to run on this runner.
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
perf_bench:
# Self-hosted runner labels — presumably a GPU machine from the unittest
# pool; confirm against the runner registration. NOTE(review): scrape has
# flattened YAML indentation; restore nesting when applying to the repo.
runs-on: [GPU, unittest]
environment: Testing
steps:
- uses: actions/checkout@v3
with:
# Check out into a run-scoped directory so concurrent runs don't collide
# on the shared self-hosted workspace.
path: dj-${{ github.run_id }}

# Bring up the ray cluster containers defined in the docker directory.
- name: Setup docker compose
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose up -d
# Install data-juicer (with all extras) inside the ray head container.
- name: Install data-juicer
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head pip install -e .\[all\]
# Remove cached HF datasets so the benchmark measures cold-cache behavior.
- name: Clean dataset cache
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head rm -rf /data/huggingface/dataset
# Run the benchmark; W&B endpoint and API key come from repo secrets.
- name: Run performance benchmark standalone
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
# Tear down containers even if earlier steps failed (if: always()).
- name: Remove docker compose
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
if: always()
run: |
docker compose down --remove-orphans
# Delete the run-scoped checkout so the self-hosted disk doesn't fill up.
- name: Cleanup workspace
if: always()
run: |
rm -rf dj-${{ github.run_id }}
15 changes: 8 additions & 7 deletions .github/workflows/publish-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ on:

env:
IMAGE_NAME: datajuicer/data-juicer
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true


jobs:
build:
runs-on: ubuntu-latest
runs-on: [docker]
permissions:
contents: read
packages: write
Expand All @@ -27,7 +28,9 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
path: dj-${{ github.run_id }}

# Install the cosign tool except on PR
# https://github.com/sigstore/cosign-installer
Expand All @@ -40,12 +43,12 @@ jobs:
# multi-platform images and export cache
# https://github.com/docker/setup-buildx-action
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v2

# Login against a Docker registry except on PR
# https://github.com/docker/login-action
- name: Log into Docker Hub
uses: docker/login-action@v3
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
Expand All @@ -64,12 +67,10 @@ jobs:
id: build-and-push
uses: docker/build-push-action@v6
with:
context: .
context: dj-${{ github.run_id }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

# Sign the resulting Docker image digest except on PRs.
# This will only write to the public Rekor transparency log when the Docker
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ env:

jobs:
unittest-single:
runs-on: [self-hosted, linux]
runs-on: [GPU, unittest]
environment: Testing
steps:
- uses: actions/checkout@v3
Expand Down
37 changes: 22 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,32 +1,39 @@
# The data-juicer image includes all open-source contents of data-juicer,
# and it will be installed in editable mode.

FROM python:3.8.18
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# install python 3.10
RUN apt-get update \
&& apt-get install -y git curl vim wget python3.10 libpython3.10-dev python3-pip \
&& apt-get install -y libgl1-mesa-glx libglib2.0-0 \
&& ln -sf /usr/bin/python3.10 /usr/bin/python3 \
&& ln -sf /usr/bin/python3.10 /usr/bin/python \
&& apt-get autoclean && rm -rf /var/lib/apt/lists/* \
&& pip install --upgrade pip

# install 3rd-party system dependencies
RUN apt-get update \
&& apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake gfortran libopenblas-dev liblapack-dev -y

# prepare the java env
WORKDIR /opt
# download jdk
RUN wget https://aka.ms/download-jdk/microsoft-jdk-17.0.9-linux-x64.tar.gz -O jdk.tar.gz && \
tar -xzf jdk.tar.gz && \
rm -rf jdk.tar.gz && \
mv jdk-17.0.9+8 jdk
RUN wget https://aka.ms/download-jdk/microsoft-jdk-17.0.9-linux-x64.tar.gz -O jdk.tar.gz \
&& tar -xzf jdk.tar.gz \
&& rm -rf jdk.tar.gz \
&& mv jdk-17.0.9+8 jdk

# set the environment variable
ENV JAVA_HOME=/opt/jdk

WORKDIR /data-juicer

# install requirements which need to be installed from source
RUN pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000

# install requirements first to better reuse installed library cache
COPY environments/ environments/
RUN cat environments/* | xargs pip install --default-timeout 1000
RUN pip install --upgrade setuptools==69.5.1 setuptools_scm \
&& pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000

# install data-juicer then
COPY . .
RUN pip install -v -e .[all]
RUN pip install -v -e .[sandbox]

# install 3rd-party system dependencies
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN pip install -v -e .[all] --default-timeout 1000
RUN pip install -v -e .[sandbox] --default-timeout 1000
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ Table of Contents

## Prerequisites

- Recommend Python>=3.8,<=3.10
- Recommend Python>=3.9,<=3.10
- gcc >= 5 (at least C++14 support)

## Installation
Expand Down Expand Up @@ -197,6 +197,22 @@ The dependency options are listed below:
| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. |
| `.[sandbox]` | Install all dependencies for sandbox. |

- Install dependencies for specific OPs

With the growth of the number of OPs, the dependencies of all OPs become very heavy. Instead of using the command `pip install -v -e .[sci]` to install all dependencies,
we provide two alternative, lighter options:

- Automatic Minimal Dependency Installation: During the execution of Data-Juicer, minimal dependencies will be automatically installed. This allows for immediate execution, but may potentially lead to dependency conflicts.

- Manual Minimal Dependency Installation: To manually install minimal dependencies tailored to a specific execution configuration, run the following command:
```shell
# only for installation from source
python tools/dj_install.py --config path_to_your_data-juicer_config_file

# use command line tool
dj-install --config path_to_your_data-juicer_config_file
```

### Using pip

- Run the following command to install the latest released `data_juicer` using `pip`:
Expand Down Expand Up @@ -317,6 +333,11 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# use command line tool
dj-analyze --config configs/demo/analyzer.yaml
# you can also use auto mode to avoid writing a recipe. It will analyze a small
# part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset
# with all Filters that produce stats.
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```
- **Note:** Analyzer only computes stats for Filter ops, so any extra Mapper or Deduplicator ops will be ignored during the analysis process.
Expand Down Expand Up @@ -386,6 +407,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
```shell
# run the data processing directly
docker run --rm \ # remove container after the processing
--privileged \
--shm-size 256g \
--network host \
--gpus all \
--name dj \ # name of the container
-v <host_data_path>:<image_data_path> \ # mount data or config directory into the container
-v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended)
Expand All @@ -398,6 +423,10 @@ docker run --rm \ # remove container after the processing
```shell
# start the container
docker run -dit \ # run the container in the background
--privileged \
--shm-size 256g \
--network host \
--gpus all \
--rm \
--name dj \
-v <host_data_path>:<image_data_path> \
Expand Down
29 changes: 28 additions & 1 deletion README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ Data-Juicer正在积极更新和维护中,我们将定期强化和新增更多

## 前置条件

* 推荐 Python>=3.8,<=3.10
* 推荐 Python>=3.9,<=3.10
* gcc >= 5 (at least C++14 support)

## 安装
Expand Down Expand Up @@ -178,6 +178,21 @@ pip install -v -e .[tools] # 安装部分工具库的依赖
| `.[tools]` | 安装专用工具库(如质量分类器)所需的依赖项 |
| `.[sandbox]` | 安装沙盒实验室的基础依赖 |

* 只安装部分算子依赖

随着OP数量的增长,所有OP的依赖变得很重。为此,我们提供了两个替代的、更轻量的选项,作为使用命令`pip install -v -e .[sci]`安装所有依赖的替代:

* 自动最小依赖安装:在执行Data-Juicer的过程中,将自动安装最小依赖。也就是说你可以直接执行,但这种方式可能会导致一些依赖冲突。

* 手动最小依赖安装:可以通过如下指令手动安装适合特定执行配置的最小依赖:
```shell
# 适用于从源码安装
python tools/dj_install.py --config path_to_your_data-juicer_config_file

# 使用命令行工具
dj-install --config path_to_your_data-juicer_config_file
```

### 使用 pip 安装

* 运行以下命令用 `pip` 安装 `data_juicer` 的最新发布版本:
Expand Down Expand Up @@ -295,6 +310,10 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# 使用命令行工具
dj-analyze --config configs/demo/analyzer.yaml
# 你也可以使用"自动"模式来避免写一个新的数据菜谱。它会使用全部可产出统计信息的 Filter 来分析
# 你的数据集的一小部分(如1000条样本,可通过 `auto_num` 参数指定)
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```

* **注意**:Analyzer 只计算 Filter 算子的状态,其他的算子(例如 Mapper 和 Deduplicator)会在分析过程中被忽略。
Expand Down Expand Up @@ -363,6 +382,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
```shell
# 直接运行数据处理
docker run --rm \ # 在处理结束后将容器移除
--privileged \
--shm-size 256g \
--network host \
--gpus all \
--name dj \ # 容器名称
-v <host_data_path>:<image_data_path> \ # 将本地的数据或者配置目录挂载到容器中
-v ~/.cache/:/root/.cache/ \ # 将 cache 目录挂载到容器以复用 cache 和模型资源(推荐)
Expand All @@ -375,6 +398,10 @@ docker run --rm \ # 在处理结束后将容器移除
```shell
# 启动容器
docker run -dit \ # 在后台启动容器
--privileged \
--shm-size 256g \
--network host \
--gpus all \
--rm \
--name dj \
-v <host_data_path>:<image_data_path> \
Expand Down
Loading

0 comments on commit 024b338

Please sign in to comment.