diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index cda9e5c36..3a5ff0ed0 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -436,12 +436,7 @@ def merge_config(ori_cfg, new_cfg: Dict): try: ori_specified_op_names = set() ori_specified_op_idx = {} # {op_name: op_order} - # format of ori_cfg.process - # ori_cfg.process[i] = { - # op_in_process_name: - # None if internal_op_para is None else - # namespace_to_dict(internal_op_para) - # } + for op_order, op_in_process in enumerate(ori_cfg.process): op_name = list(op_in_process.keys())[0] ori_specified_op_names.add(op_name) @@ -450,13 +445,13 @@ def merge_config(ori_cfg, new_cfg: Dict): for new_k, new_v in new_cfg.items(): # merge parameters other than `cfg.process` and DJ-OPs if new_k in ori_cfg and new_k != 'process' and '.' not in new_k: - print( - '=' * 15, f'\nBefore merging, the cfg item is: ' - f'{new_k}: {ori_cfg[new_k]}') + logger.info('=' * 15) + logger.info(f'Before merging, the cfg item is: ' + f'{new_k}: {ori_cfg[new_k]}') ori_cfg[new_k] = new_v - print( - f'After merging, the cfg item is: ' - f'{new_k}: {new_v}\n', '=' * 15, '\n') + logger.info(f'After merging, the cfg item is: ' + f'{new_k}: {new_v}') + logger.info('=' * 15) else: # merge parameters of DJ-OPs into cfg.process # for nested style, e.g., `remove_table_text_mapper.min_col: 2` @@ -466,13 +461,13 @@ def merge_config(ori_cfg, new_cfg: Dict): op_name, para_name = key_as_groups[0], key_as_groups[1] op_order = ori_specified_op_idx[op_name] ori_cfg_val = ori_cfg.process[op_order][op_name][para_name] - print( - '=' * 15, f'\nBefore merging, the cfg item is: ' - f'{new_k}: {ori_cfg_val}') + logger.info('=' * 15) + logger.info(f'Before merging, the cfg item is: ' + f'{new_k}: {ori_cfg_val}') ori_cfg.process[op_order][op_name][para_name] = new_v - print( - f'After merging, the cfg item is: ' - f'{new_k}: {new_v}\n', '=' * 15, '\n') + logger.info(f'After merging, the cfg item is: ' + f'{new_k}: {new_v}') + logger.info('=' * 15) ori_cfg = init_setup_from_cfg(ori_cfg) diff --git a/tools/hpo/README.md b/tools/hpo/README.md index 3b3e8f126..f7b7108b3 100644 --- a/tools/hpo/README.md +++ b/tools/hpo/README.md @@ -6,7 +6,7 @@ We incorporate an automated HPO tool, WandB [Sweep](https://docs.wandb.ai/guides With this tool, users can investigate correlations and importance scores of specific hyper-parameters of data recipes from the HPO view. -*Note*: this is an experimental feature. Auto-HPO for data recipes still has +**Note**: this is an experimental feature. Auto-HPO for data recipes still has a large room to explore. Feel free to provide more suggestions, discussion, and contribution via new PRs! @@ -14,7 +14,7 @@ and contribution via new PRs! ## Prerequisite You need to install data-juicer first. Besides, the tool leverages WandB, install it via `pip install wandb`. -Before using this tool, you need to run ` +Before using this tool, you need to run ```wandb login``` and enter your WandB API key. If you have your own instance of WandB (e.g., [locally-hosted machine](https://docs.wandb.ai/guides/hosting/)), run the following script: @@ -47,54 +47,5 @@ After running it, you will get the result similar to: ![img](https://img.alicdn. You can implement your own HPO objective in `get_hpo_objective` function, e.g., linking the data recipes to - model_loss (by replacing the quality scorer into a training procedure), -- downstream_task (by eplacing the quality scorer into a training and an# Hyper-parameter Optimization for Data Recipe - -## Auto-HPO - -We incorporate an automated HPO tool, WandB [Sweep](https://docs.wandb.ai/guides/sweeps), into Data-Juicer to streamline the finding of good data processing hyper-parameters. -With this tool, users can investigate correlations and importance scores of -specific hyper-parameters of data recipes from the HPO view. - -*Note*: this is an experimental feature. Auto-HPO for data recipes still has -a large room to explore. Feel free to provide more suggestions, discussions, and contributions via new PRs! - - -## Prerequisite -You need to install data-juicer first. -Besides, the tool leverages WandB, install it via `pip install wandb`. -Before using this tool, you need to run ` -```wandb login``` and enter your WandB -API key. -If you have your own instance of WandB (e.g., [locally-hosted machine](https://docs.wandb.ai/guides/hosting/)), run the following script: - -```shell -wandb login --host -# enter your api key -``` - - - -## Usage and Customization - -Given a data recipe, characterized by a specified configuration file -``, you can use `execute_hpo.py` to search the -hyper-parameter space defined by ``. -```shell -# cd tools/hpo -python execute_hpo.py --config --hpo_config - -# e.g., -python execute_hpo.py --config configs/process.yaml --hpo_config configs/quality_score_hpo.yaml -``` - -We provide an illustrative objective "quality_score" in `hpo/objects.py`, -which uses quality scorer to measure the processed data, and links the average scores to hyper-parameters of data recipes. -After running it, you will get the result similar to: ![img](https://img.alicdn.com/imgextra/i2/O1CN017fT4Al1bVldeuCmiI_!!6000000003471-2-tps-2506-1710.png) - - -You can implement your own HPO objective in `get_hpo_objective` function, e.g., linking the data -recipes to -- model_loss (by replacing the quality scorer with a training procedure), -- downstream_task (by replacing the quality scorer with training and - evaluation procedures), or +- downstream_task (by replacing the quality scorer with training and evaluation procedures), or - some synergy measures that combine metrics you are interested in, such that the trade-offs from different views can be explored. diff --git a/tools/hpo/README_ZH.md b/tools/hpo/README_ZH.md index ab84e7341..7cdb1a407 100644 --- a/tools/hpo/README_ZH.md +++ b/tools/hpo/README_ZH.md @@ -6,7 +6,7 @@ Data-Juicer 中,以简化改良数据处理超参数的过程。 使用此工具,用户可以研究探索 *数据配方的特定超参数* 和 *指定目标度量(如数据质量分、模型loss等)* 之间的 相关性和重要性得分 -*注意*:这是一个实验性功能。 用于数据配方的 Auto-HPO 仍然有 +**注意**:这是一个实验性功能。 用于数据配方的 Auto-HPO 仍然有 一个极大的探索空间,暂无标准做法。 欢迎大家提出更多的建议、讨论、 并通过新的 PR 做出贡献! @@ -39,7 +39,7 @@ python execute_hpo.py --config configs/process.yaml --hpo_config configs/quality ``` 我们在`hpo/objects.py`中提供了一个示意性的搜索目标 `quality_score`, -它使用质量评分器来度量处理后的数据,并将平均质分数链接到数据配方的超参数。 +它使用质量评分器来度量处理后的数据,并将平均质量分数链接到数据配方的超参数。 运行后,你会得到类似如下的结果:![img](https://img.alicdn.com/imgextra/i2/O1CN017fT4Al1bVldeuCmiI_!!6000000003471-2-tps-2506-1710.png) diff --git a/tools/hpo/configs/quality_score_hpo.yaml b/tools/hpo/configs/quality_score_hpo.yaml index 543e7b64b..cedae00ed 100644 --- a/tools/hpo/configs/quality_score_hpo.yaml +++ b/tools/hpo/configs/quality_score_hpo.yaml @@ -22,8 +22,7 @@ parameters: min: 256 max: 8192 - -#early_terminate: -# type: hyperband -# max_iter: 27 -# s: 2 +early_terminate: + type: hyperband + max_iter: 27 + s: 2