settings/example.json

{
    "input_path": "your_input_data_path (please use absolute path)",
    "input_ext": "jsonl",
    "input_text_key": "text",
    "output_path": "your_output_data_path (please use absolute path)",
    "output_source_value": "the-dataset-name",
    "if_filter": false,
    "filter_paras": {
        "fil_my_rules": {
            "comments": "To make ZHEM more flexible, we have added a new filtering interface that allows users to add custom filter rules to 'utils/utils/my_rules.py'.",
            "use": false,
            "rules": [

            ]
        },
        "fil_dirty_words": {
            "comments": "Remove the text if the words in 'words' appear in the text.", 
            "use": false,
            "words": [
                
            ]
        },
        "fil_langs": {
            "use": false,
            "accept_lang_list": ["en", "zh"]
        },
        "fil_ppl": {
            "use": false,
            "param": 3
        },
        "fil_lang_score": {
            "use": false,
            "reject_threshold": 0.5
        },
        "fil_short_texts": {
            "use" : false,
            "param": 170
        },
        "fil_non_ch": {
            "use": false,
            "param": 0.4
        },
        "fil_alphanum": {
            "use": false,
            "lower_bound": 0.0,
            "upper_bound": 1.0
        },
        "fil_copyright": {
            "use": false,
            "ch_list":[
            ],
            "en_list": [
            ]
        },
        "fil_short_lines": {
            "use": false,
            "param": 0.25,
            "lower_bound": 3
        }
    },
    "if_clean": false,
    "clean_paras": {
        "extractor": {
            "use": false,
            "mode": {"html": false, "mobi": false, "epub": false},
            "keep_newline_labels": [
                
            ]
        },
        "my_funcs": {
            "comments": "To make ZHEM more flexible, we have added a new cleaning interface that allows users to add custom cleaning functions to 'utils/utils/my_funcs.py'.",
            "use": false,
            "funcs": [
                "for_bdbk"
            ]
        },
        "rm_crawlerchars": {
            "use": false
        },
        "sub_newline": {
            "use": false
        },
        "rm_re_rules": {
            "comments": "remove text fragments matching any of the regular expressions in `re_list`",
            "use": false,
            "re_list": [
            ]
        },
        "sub_re_rules": {
            "comments": "substitute (replace) text fragments using the <key, value> pairs of `re_dict`: each key is a regular expression and each value is plain replacement text",
            "use": false,
            "re_dict": {
                
            }
        },
        "rm_str_rules": {
            "comments": "remove text fragments matching any of the strings in `str_list`",
            "use": false,
            "str_list": [
            ]
        },
        "rm_re_lines": {
            "comments": "remove **a line** if any text fragment in that line matches any of the **regular expressions** in `re_list`",
            "use": false,
            "re_list": [

            ]
        },
        "rm_str_lines": {
            "comments": "remove **a line** if any text fragment in that line matches any of the **strings** in `str_list`",
            "use": false,
            "str_list": [

            ]
        },
        "rm_str_seg": {
            "comments": "remove **a segment** after the match if any text fragment in the text matches any of the **strings** in `str_list`",
            "use": false,
            "str_list": [
            ]
        },
        "rm_re_seg": {
            "comments": "remove **a segment** after the match if any text fragment in the text matches any of the **regular expressions** in `re_list`",
            "use": false,
            "re_list": [
            ]
        },
        "tra2sim": {
            "use": false,
            "target": "zh-cn"
        },
        "rm_pii": {
            "comments": "remove personally identifiable information",
            "use": true,
            "details": {
                "email": false,
                "idcard": true,
                "ip": false,
                "phone": false,
                "url": false
            }
        },
        "dedup_line_ngram": {
            "use": false,
            "n": 5,
            "thre_sim": 0.95
        }
    },
    "if_dedup": false,
    "dedup_paras": {
        "language": "zh",
        "ngram": 5,
        "num_perm": 128,
        "min_length": 20,
        "threshold": 0.7
    },
    "if_parallel": false,
    "parallel_paras": {
        "comments": "`if_cut` indicates whether the input data has already been cut into pieces",
        "if_cut": false,
        "cut_tmpfiles_path": "",
        "n_process": 10,
        "chunk_size": -1
    },
    "if_debug": false,
    "debug_paras": {
        "debug_report_path": "debug_report/debug_report.json",
        "debug_cases_num": 5,
        "if_sample": true,
        "debug_sample_num_per_file": 100,  
        "debug_find_cases": {
            "use" : false,
            "words": [
                
            ]
        },
        "debug_short_texts": {
            "length": 200,
            "if_fix_fil_ratio": false,
            "exp_fil_ratio": 0.05
        },
        "debug_non_ch": {
            "use": false,
            "if_fix_fil_ratio": false,
            "exp_fil_ratio": 0.05
        },
        "debug_short_lines": {
            "use": false,
            "if_fix_fil_ratio": false,
            "exp_fil_ratio": 0.05
        },
        "debug_ppl": {
            "use": false
        }
    }
}