# configs/data_juicer_recipes/pile-pubmed-abstract-refine.yaml
# (recipe path kept as a comment: left bare, this line is a root-level scalar
# and the mapping that follows it fails to parse)

# global parameters
project_name: 'Data-Juicer-recipes-pubmed-abstract'
dataset_path: '/path/to/your/dataset'  # path to your dataset directory or file
# placeholder; presumably the file the processed dataset is exported to -- confirm
export_path: '/path/to/your/dataset.jsonl'

np: 50  # number of subprocesses used to process your dataset
# tracer switch; see the Data-Juicer config reference for what it records
open_tracer: true

# process schedule
# a list of several process operators with their arguments
process:
  # Text-cleaning mappers, listed first in the process sequence.
  # None of them is given arguments in this recipe (bare keys parse as null).
  # The short notes below are read off the operator names; check the
  # Data-Juicer operator docs for the exact behaviour of each one.
  - clean_email_mapper:  # email addresses
  - clean_links_mapper:  # links
  - fix_unicode_mapper:  # unicode fixes
  - punctuation_normalization_mapper:  # punctuation normalization
  - whitespace_normalization_mapper:  # whitespace normalization

  # Filters. Thresholds are annotated relative to a "3sigma" statistic of the
  # dataset; bare counts such as "4068" or "-- 7410" are carried over from the
  # original annotations (presumably the number of samples each filter removes
  # -- confirm).

  # 4068
  - alphanumeric_filter:
      tokenization: false
      # set below the 3sigma value (0.773)
      min_ratio: 0.7
      # 3sigma
      max_ratio: 0.881

  # originally labelled "for code"
  # NOTE(review): this recipe targets PubMed abstracts -- confirm the label
  - average_line_length_filter:
      # set above the 3sigma value (1471) -- 7410
      max_len: 2100

  - character_repetition_filter:
      rep_len: 10
      # set above the 3sigma value (0.1458) -- 6060
      max_ratio: 0.2

  - flagged_words_filter:
      lang: 'en'
      tokenization: true
      # 3sigma
      max_ratio: 0.00232

  # original note: "remove language filter"
  # (no `lang` key is set here; only the score threshold is configured)
  - language_id_score_filter:
      min_score: 0.5

  # originally labelled "for code"
  # NOTE(review): this recipe targets PubMed abstracts -- confirm the label
  - maximum_line_length_filter:
      # remove 8202 samples
      max_len: 4000

  - perplexity_filter:
      lang: 'en'
      # remove 10284 samples
      max_ppl: 4000

  - special_characters_filter:
      # remove 5532 samples
      max_ratio: 0.38

  - text_length_filter:
      # set above the 3sigma value -- 10873
      max_len: 4000

  - words_num_filter:
      lang: 'en'
      tokenization: true
      # remove 10790 samples
      min_num: 20
      # remove 22709 samples
      max_num: 700

  - word_repetition_filter:
      lang: 'en'
      tokenization: true
      rep_len: 10
      # 3sigma
      max_ratio: 0.0887

  # Deduplication step, last in the process sequence. Window size and hamming
  # distance are tuned for short texts (see the per-key notes).
  - document_simhash_deduplicator:
      tokenization: 'space'
      # small window size for short texts
      window_size: 3
      lowercase: true
      # \p{P} is the Unicode "punctuation" property class
      ignore_pattern: '\p{P}'
      num_blocks: 10
      # larger hamming distance threshold for short texts
      hamming_distance: 8