diff --git a/llm/alignment/dpo/dpo_argument.py b/llm/alignment/dpo/dpo_argument.py index c9552a36260a..df44ff47dcfd 100644 --- a/llm/alignment/dpo/dpo_argument.py +++ b/llm/alignment/dpo/dpo_argument.py @@ -18,16 +18,7 @@ from paddlenlp.trainer import TrainingArguments from paddlenlp.trainer.trainer_utils import IntervalStrategy - - -def add_start_docstrings(*docstr): - """Adds docstrings for a function.""" - - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/alignment/rm/flashmask/reward_argument.py b/llm/alignment/rm/flashmask/reward_argument.py index 7776bfda4bf4..06097a6d7cdc 100644 --- a/llm/alignment/rm/flashmask/reward_argument.py +++ b/llm/alignment/rm/flashmask/reward_argument.py @@ -17,16 +17,7 @@ from typing import Optional from paddlenlp.trainer import TrainingArguments - - -def add_start_docstrings(*docstr): - """Adds docstrings for a function.""" - - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/auto_parallel/gpt-3/run_pretrain_auto.py b/llm/auto_parallel/gpt-3/run_pretrain_auto.py index 16fb05dae15b..00e44d10a244 100644 --- a/llm/auto_parallel/gpt-3/run_pretrain_auto.py +++ b/llm/auto_parallel/gpt-3/run_pretrain_auto.py @@ -52,14 +52,7 @@ check_data_split, print_rank_0, ) - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/auto_parallel/llama/run_pretrain_auto.py b/llm/auto_parallel/llama/run_pretrain_auto.py index 8d6ff5462807..14b078bc19d9 
100644 --- a/llm/auto_parallel/llama/run_pretrain_auto.py +++ b/llm/auto_parallel/llama/run_pretrain_auto.py @@ -55,14 +55,7 @@ check_data_split, print_rank_0, ) - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/auto_parallel/qwen/run_pretrain_3D_auto.py b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py index e1e0eae25491..a476f68cc5c9 100644 --- a/llm/auto_parallel/qwen/run_pretrain_3D_auto.py +++ b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py @@ -53,14 +53,7 @@ check_data_split, print_rank_0, ) - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/experimental/ernie-3.5-se/run_pretrain.py b/llm/experimental/ernie-3.5-se/run_pretrain.py index 6faefcedac5a..971a3ecadaa6 100644 --- a/llm/experimental/ernie-3.5-se/run_pretrain.py +++ b/llm/experimental/ernie-3.5-se/run_pretrain.py @@ -49,12 +49,7 @@ } -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass diff --git a/llm/run_finetune.py b/llm/run_finetune.py index 1ac3e384da09..e18f25bb6cc2 100644 --- a/llm/run_finetune.py +++ b/llm/run_finetune.py @@ -19,13 +19,7 @@ from functools import partial import paddle -from utils.argument import ( - DataArgument, - GenerateArgument, - ModelArgument, - ReftArgument, - TrainingArguments, -) +from utils.argument import GenerateArgument, ReftArgument from utils.data import convert_example_for_reft, get_convert_example 
from paddlenlp.data import DataCollatorForSeq2Seq @@ -68,7 +62,7 @@ ) from paddlenlp.transformers.configuration_utils import LlmMetaConfig from paddlenlp.transformers.refined_recompute import update_refined_recompute -from paddlenlp.trl import SFTTrainer +from paddlenlp.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer from paddlenlp.trl.llm_utils import ( ZeroPaddingIterDatasetCallback, compute_metrics, @@ -86,7 +80,7 @@ def main(): - parser = PdArgumentParser((GenerateArgument, ModelArgument, ReftArgument, DataArgument, TrainingArguments)) + parser = PdArgumentParser((GenerateArgument, ModelConfig, ReftArgument, DataConfig, SFTConfig)) if len(sys.argv) >= 2 and sys.argv[1].endswith(".json"): gen_args, model_args, reft_args, data_args, training_args = parser.parse_json_file_and_cmd_lines() else: @@ -230,12 +224,14 @@ def neft_post_hook(module, input, output): neft_post_hook_handle = model.get_input_embeddings().register_forward_post_hook(neft_post_hook) else: raise NotImplementedError("Only support neftune for model with get_input_embeddings") + if training_args.sequence_parallel: register_sequence_parallel_allreduce_hooks( model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce ) # Load tokenizer & dataset tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio) + reft_layers = None if model_args.reft: # reft requires padding side right tokenizer.padding_side = "right" @@ -244,7 +240,8 @@ def neft_post_hook(module, input, output): layers = [int(l) for l in layers.split(";")] else: layers = [l for l in range(model_config.num_hidden_layers)] - logging.info("Using ReFT with layers: ", layers) + reft_layers = layers + logging.info("Using ReFT with layers: ", reft_layers) # init chat_template for tokenizer init_chat_template(tokenizer, model_args.model_name_or_path, data_args.chat_template) @@ -255,59 +252,7 @@ def neft_post_hook(module, input, output): if 
isinstance(tokenizer, LlamaTokenizer) or isinstance(tokenizer, Llama3Tokenizer): tokenizer.pad_token_id = tokenizer.eos_token_id - if data_args.dataset_name_or_path is None: - raise ValueError(f"Please specific dataset name or path (got {data_args.dataset_name_or_path})") - elif os.path.exists(os.path.join(data_args.dataset_name_or_path, "train.json")) or os.path.exists( - os.path.join(data_args.dataset_name_or_path, "dev.json") - ): - if training_args.do_train: - train_ds = load_dataset( - "json", - data_files=os.path.join(data_args.dataset_name_or_path, "train.json"), - lazy=data_args.lazy, - )[0] - else: - train_ds = None - if training_args.do_eval: - dev_ds = load_dataset( - "json", - data_files=os.path.join(data_args.dataset_name_or_path, "dev.json"), - lazy=data_args.lazy, - )[0] - else: - dev_ds = None - - elif os.path.exists(os.path.join(data_args.dataset_name_or_path, "train")) or os.path.exists( - os.path.join(data_args.dataset_name_or_path, "dev") - ): - import glob - - if training_args.do_train: - train_ds = load_dataset( - "json", - data_files=glob.glob(os.path.join(data_args.dataset_name_or_path, "train", "*.json")), - lazy=data_args.lazy, - )[0] - else: - train_ds = None - if training_args.do_eval: - dev_ds = load_dataset( - "json", - data_files=glob.glob(os.path.join(data_args.dataset_name_or_path, "dev", "*.json")), - lazy=data_args.lazy, - )[0] - else: - dev_ds = None - - else: - if training_args.do_train: - train_ds = load_dataset(data_args.dataset_name_or_path, splits=["train"])[0] - else: - train_ds = None - if training_args.do_eval: - dev_ds = load_dataset(data_args.dataset_name_or_path, splits=["dev"])[0] - else: - dev_ds = None + train_ds, dev_ds, test_ds = create_dataset(data_args, training_args) # TODO(ZHUI & sijunhe): Temporary implementation. Generalize this logic and move to Trainer later. 
if training_args.resume_from_checkpoint is not None and data_args.lazy: @@ -340,65 +285,185 @@ def neft_post_hook(module, input, output): tokenizer=tokenizer, data_args=data_args, positions=reft_args.position, - num_interventions=len(layers), + num_interventions=len(reft_layers), ) else: trans_func = partial(get_convert_example(model), tokenizer=tokenizer, data_args=data_args) - train_ds = ( - train_ds.map( - partial(trans_func, is_test=False, zero_padding=data_args.zero_padding, flash_mask=model_args.flash_mask) - ) - if train_ds is not None - else None - ) - eval_zero_padding = data_args.zero_padding if data_args.zero_padding and data_args.eval_with_do_generation: logger.warning( "`zero_padding` conflicts with `eval_with_do_generation`. Setting zero_padding to False for the eval_dataset." ) eval_zero_padding = False - dev_ds = ( - dev_ds.map( - partial( - trans_func, - is_test=data_args.eval_with_do_generation, - zero_padding=eval_zero_padding, - flash_mask=model_args.flash_mask, - ) - ) - if dev_ds is not None - else None + + train_ds, dev_ds, test_ds = trans_dataset_to_ids( + train_ds, dev_ds, test_ds, model_args, data_args, trans_func, eval_zero_padding ) + if data_args.zero_padding: if data_args.lazy: intoken_dataset = ZeroPaddingIterableDataset else: intoken_dataset = ZeroPaddingMapDataset logger.info("Creating Zero Padding Data Stream. 
This may take a few minutes.") - train_ds = ( - intoken_dataset( + if train_ds is not None: + train_ds = intoken_dataset( train_ds, tokenizer=tokenizer, max_length=data_args.max_length, greedy_zero_padding=data_args.greedy_zero_padding, ) - if train_ds is not None - else None - ) + if eval_zero_padding and dev_ds is not None: + dev_ds = intoken_dataset(dev_ds, tokenizer=tokenizer, max_length=data_args.max_length) + if eval_zero_padding and test_ds is not None: + test_ds = intoken_dataset(test_ds, tokenizer=tokenizer, max_length=data_args.max_length) + + model = create_peft_model(model_args, reft_args, training_args, dtype, model_config, model, reft_layers) + + def compute_metrics_do_generation(eval_preds): + rouge1 = Rouge1() + rouge2 = Rouge2() + rougel = RougeL() + bleu4 = BLEU(n_size=4) + + predictions = [x[x != -100].tolist() for x in eval_preds.predictions] + references = [x[x != -100].tolist() for x in eval_preds.label_ids] + + predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True, clean_up_tokenization_spaces=False) + references = tokenizer.batch_decode(references, skip_special_tokens=True, clean_up_tokenization_spaces=False) + if data_args.save_generation_output: + with open(os.path.join(training_args.output_dir, "generated_output.json"), "w", encoding="utf-8") as f: + for pred, ref in zip(predictions, references): + out = {"output": pred, "tgt": ref} + f.write(json.dumps(out, ensure_ascii=False) + "\n") - if eval_zero_padding: - dev_ds = ( - intoken_dataset( - dev_ds, - tokenizer=tokenizer, - max_length=data_args.max_length, - ) - if dev_ds is not None - else None + # for pred in predictions: + rouge1_score = rouge1.score(predictions, references) + rouge2_score = rouge2.score(predictions, references) + for pred, ref in zip(predictions, references): + rougel.add_inst(pred, [ref]) + bleu4.add_inst(pred, [ref]) + return { + "rouge1": rouge1_score, + "rouge2": rouge2_score, + "rougel": rougel.score(), + "bleu4": bleu4.score(), + } + + # 
Create trainer + + if ( + training_args.pipeline_parallel_degree > 1 + or training_args.sequence_parallel + or training_args.autotuner_benchmark + or data_args.zero_padding + or data_args.pad_to_max_length + ): + # NOTE(gongenlei): new add autotuner_benchmark + max_length = data_args.max_length + padding = "max_length" + else: + max_length = None + padding = True + + if training_args.pipeline_parallel_degree > 1: + metrics = None + elif data_args.eval_with_do_generation: + metrics = compute_metrics_do_generation + else: + metrics = compute_metrics + + data_collator_fn = DataCollatorForSeq2Seq( + tokenizer=tokenizer, + max_length=max_length, + padding=padding, + max_label_length=max_length, + return_tensors="np", + return_attention_mask=not model_args.flash_mask, + pad_to_multiple_of=data_args.pad_to_multiple_of, + ) + trainer = SFTTrainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=dev_ds, + tokenizer=tokenizer, + compute_metrics=metrics, + data_collator=data_collator_fn if not model_args.reft else ReftDataCollator(data_collator=data_collator_fn), + do_generation=data_args.eval_with_do_generation, + callbacks=[ZeroPaddingIterDatasetCallback()] if isinstance(train_ds, ZeroPaddingIterableDataset) else None, + gen_args=gen_args, + data_args=data_args, + ) + trainable_parameters = [p for p in model.parameters() if not p.stop_gradient] + trainer.set_optimizer_grouped_parameters(trainable_parameters) + + # Train + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + if model_args.neftune: + neft_post_hook_handle.remove() + if training_args.benchmark: + total_effective_tokens = ( + sum([len(i["input_ids"]) for i in trainer.train_dataset]) * train_result.metrics["progress_or_epoch"] ) + 
effective_tokens_per_second = total_effective_tokens / train_result.metrics["train_runtime"] + logger.info(f"Effective_Tokens_per_second: {effective_tokens_per_second} ") + logger.info("Benchmark done.") + else: + if model_args.save_to_aistudio: + save_to_aistudio(model_args, training_args, trainer) + + if not training_args.autotuner_benchmark: + trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + + # Evaluation test set + if training_args.do_predict: + eval_result = trainer.predict(test_ds).metrics + trainer.log_metrics("test", eval_result) + + # Evaluation dev set + if training_args.do_eval: + logger.info("*** Evaluate result after train ***") + eval_result = trainer.evaluate(dev_ds) + trainer.log_metrics("eval", eval_result) + + +def save_to_aistudio(model_args, training_args, trainer): + kwargs = {} + if model_args.aistudio_token is not None: + kwargs["token"] = model_args.aistudio_token + # PEFT Model only save PEFT parameters, if pretrained model obtains from aistudio + if model_args.from_aistudio and (model_args.lora or model_args.prefix_tuning): + kwargs["base_model"] = model_args.model_name_or_path + else: + trainer.tokenizer.save_to_aistudio( + repo_id=model_args.aistudio_repo_id, + private=model_args.aistudio_repo_private, + license=model_args.aistudio_repo_license, + exist_ok=True, + **kwargs, + ) + trainer.model.save_to_aistudio( + repo_id=model_args.aistudio_repo_id, + private=model_args.aistudio_repo_private, + license=model_args.aistudio_repo_license, + merge_tensor_parallel=training_args.tensor_parallel_degree > 1, + exist_ok=True, + **kwargs, + ) + +def create_peft_model(model_args, reft_args, training_args, dtype, model_config, model, reft_layers): if model_args.prefix_tuning: if training_args.pipeline_parallel_degree > 1: raise NotImplementedError("Prefix tuning is not implemented for 
pipeline parallelism.") @@ -466,8 +531,6 @@ def neft_post_hook(module, input, output): else: model = LoKrModel.from_pretrained(model=model, lokr_path=model_args.lokr_path) - # For debugging purpose, you can print the model to see which layer is transformed into a lokr layer - # print(model) if model_args.reft: intervention_dtype = dtype intervention_params = { @@ -486,7 +549,7 @@ def neft_post_hook(module, input, output): "low_rank_dimension": reft_args.rank, "intervention": intervention_mapping[reft_args.intervention_type](**intervention_params), } - for l in layers + for l in reft_layers ] reft_config = ReFTConfig( representations=representations, intervention_params=intervention_params, position=reft_args.position @@ -497,36 +560,6 @@ def neft_post_hook(module, input, output): model.disable_model_gradients() model.print_trainable_parameters() - def compute_metrics_do_generation(eval_preds): - rouge1 = Rouge1() - rouge2 = Rouge2() - rougel = RougeL() - bleu4 = BLEU(n_size=4) - - predictions = [x[x != -100].tolist() for x in eval_preds.predictions] - references = [x[x != -100].tolist() for x in eval_preds.label_ids] - - predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True, clean_up_tokenization_spaces=False) - references = tokenizer.batch_decode(references, skip_special_tokens=True, clean_up_tokenization_spaces=False) - if data_args.save_generation_output: - with open(os.path.join(training_args.output_dir, "generated_output.json"), "w", encoding="utf-8") as f: - for pred, ref in zip(predictions, references): - out = {"output": pred, "tgt": ref} - f.write(json.dumps(out, ensure_ascii=False) + "\n") - - # for pred in predictions: - rouge1_score = rouge1.score(predictions, references) - rouge2_score = rouge2.score(predictions, references) - for pred, ref in zip(predictions, references): - rougel.add_inst(pred, [ref]) - bleu4.add_inst(pred, [ref]) - return { - "rouge1": rouge1_score, - "rouge2": rouge2_score, - "rougel": rougel.score(), - 
"bleu4": bleu4.score(), - } - if model_args.vera: target_modules = get_lora_target_modules(model) vera_config = VeRAConfig( @@ -541,125 +574,92 @@ def compute_metrics_do_generation(eval_preds): model.mark_only_vera_as_trainable(notfreezeB=True) model.print_trainable_parameters() - # Create trainer + return model - if ( - training_args.pipeline_parallel_degree > 1 - or training_args.sequence_parallel - or training_args.autotuner_benchmark - or data_args.zero_padding - or data_args.pad_to_max_length - ): - # NOTE(gongenlei): new add autotuner_benchmark - max_length = data_args.max_length - padding = "max_length" - else: - max_length = None - padding = True - if training_args.pipeline_parallel_degree > 1: - metrics = None - elif data_args.eval_with_do_generation: - metrics = compute_metrics_do_generation - else: - metrics = compute_metrics +def trans_dataset_to_ids(train_ds, dev_ds, test_ds, model_args, data_args, trans_func, eval_zero_padding): + if train_ds is not None: + train_ds = train_ds.map( + partial(trans_func, is_test=False, zero_padding=data_args.zero_padding, flash_mask=model_args.flash_mask) + ) + if dev_ds is not None: + dev_ds = dev_ds.map( + partial( + trans_func, + is_test=data_args.eval_with_do_generation, + zero_padding=eval_zero_padding, + flash_mask=model_args.flash_mask, + ) + ) + if test_ds is not None: + test_ds = test_ds.map(partial(trans_func, is_test=data_args.eval_with_do_generation)) - data_collator_fn = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - max_length=max_length, - padding=padding, - max_label_length=max_length, - return_tensors="np", - return_attention_mask=not model_args.flash_mask, - pad_to_multiple_of=data_args.pad_to_multiple_of, - ) - trainer = SFTTrainer( - model=model, - args=training_args, - train_dataset=train_ds, - eval_dataset=dev_ds, - tokenizer=tokenizer, - compute_metrics=metrics, - data_collator=data_collator_fn if not model_args.reft else ReftDataCollator(data_collator=data_collator_fn), - 
do_generation=data_args.eval_with_do_generation, - callbacks=[ZeroPaddingIterDatasetCallback()] if isinstance(train_ds, ZeroPaddingIterableDataset) else None, - gen_args=gen_args, - data_args=data_args, - ) - trainable_parameters = [p for p in model.parameters() if not p.stop_gradient] - trainer.set_optimizer_grouped_parameters(trainable_parameters) + return train_ds, dev_ds, test_ds - # Train - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - if model_args.neftune: - neft_post_hook_handle.remove() - if training_args.benchmark: - total_effective_tokens = ( - sum([len(i["input_ids"]) for i in trainer.train_dataset]) * train_result.metrics["progress_or_epoch"] - ) - effective_tokens_per_second = total_effective_tokens / train_result.metrics["train_runtime"] - logger.info(f"Effective_Tokens_per_second: {effective_tokens_per_second} ") - logger.info("Benchmark done.") - else: - if model_args.save_to_aistudio: - kwargs = {} - if model_args.aistudio_token is not None: - kwargs["token"] = model_args.aistudio_token - # PEFT Model only save PEFT parameters, if pretrained model obtains from aistudio - if model_args.from_aistudio and (model_args.lora or model_args.prefix_tuning): - kwargs["base_model"] = model_args.model_name_or_path - else: - trainer.tokenizer.save_to_aistudio( - repo_id=model_args.aistudio_repo_id, - private=model_args.aistudio_repo_private, - license=model_args.aistudio_repo_license, - exist_ok=True, - **kwargs, - ) - trainer.model.save_to_aistudio( - repo_id=model_args.aistudio_repo_id, - private=model_args.aistudio_repo_private, - license=model_args.aistudio_repo_license, - merge_tensor_parallel=training_args.tensor_parallel_degree > 1, - exist_ok=True, - **kwargs, - ) - if not training_args.autotuner_benchmark: - 
trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() +def create_dataset(data_args, training_args): + if data_args.dataset_name_or_path is None: + raise ValueError(f"Please specific dataset name or path (got {data_args.dataset_name_or_path})") - # Evaluation test set - if training_args.do_predict: - test_ds = load_dataset( - "json", - data_files=os.path.join(data_args.dataset_name_or_path, "test.json"), - lazy=data_args.lazy, - )[0] + train_ds = None + dev_ds = None + test_ds = None + if os.path.exists(os.path.join(data_args.dataset_name_or_path, "train.json")) or os.path.exists( + os.path.join(data_args.dataset_name_or_path, "dev.json") + ): + if training_args.do_train: + train_ds = load_dataset( + "json", + data_files=os.path.join(data_args.dataset_name_or_path, "train.json"), + lazy=data_args.lazy, + )[0] + if training_args.do_eval: + dev_ds = load_dataset( + "json", + data_files=os.path.join(data_args.dataset_name_or_path, "dev.json"), + lazy=data_args.lazy, + )[0] + if training_args.do_predict: + test_ds = load_dataset( + "json", + data_files=os.path.join(data_args.dataset_name_or_path, "test.json"), + lazy=data_args.lazy, + )[0] - test_ds = test_ds.map(partial(trans_func, is_test=data_args.eval_with_do_generation)) - if eval_zero_padding: - test_ds = intoken_dataset( - test_ds, - tokenizer=tokenizer, - max_length=data_args.max_length, - ) - eval_result = trainer.predict(test_ds).metrics - trainer.log_metrics("test", eval_result) + elif os.path.exists(os.path.join(data_args.dataset_name_or_path, "train")) or os.path.exists( + os.path.join(data_args.dataset_name_or_path, "dev") + ): + import glob - # Evaluation dev set - if training_args.do_eval: - logger.info("*** Evaluate result after train ***") - eval_result = trainer.evaluate(dev_ds) - trainer.log_metrics("eval", eval_result) + if 
training_args.do_train: + train_ds = load_dataset( + "json", + data_files=glob.glob(os.path.join(data_args.dataset_name_or_path, "train", "*.json")), + lazy=data_args.lazy, + )[0] + if training_args.do_eval: + dev_ds = load_dataset( + "json", + data_files=glob.glob(os.path.join(data_args.dataset_name_or_path, "dev", "*.json")), + lazy=data_args.lazy, + )[0] + if training_args.do_predict: + test_ds = load_dataset( + "json", + data_files=glob.glob(os.path.join(data_args.dataset_name_or_path, "test", "*.json")), + lazy=data_args.lazy, + )[0] + else: + if training_args.do_train: + train_ds = load_dataset(data_args.dataset_name_or_path, splits=["train"])[0] + + if training_args.do_eval: + dev_ds = load_dataset(data_args.dataset_name_or_path, splits=["dev"])[0] + + if training_args.do_predict: + test_ds = load_dataset(data_args.dataset_name_or_path, splits=["test"])[0] + + return train_ds, dev_ds, test_ds if __name__ == "__main__": diff --git a/llm/run_pretrain.py b/llm/run_pretrain.py index 76cdbdbcb7ac..18436f015e54 100644 --- a/llm/run_pretrain.py +++ b/llm/run_pretrain.py @@ -53,12 +53,7 @@ os.environ["USE_CASUAL_MASK"] = "True" -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator +from paddlenlp.trainer.utils.doc import add_start_docstrings @dataclass @@ -87,13 +82,9 @@ class PreTrainingArguments(TrainingArguments): metadata={"help": "Weather to run benchmark by autotuner. 
True for from_scratch and pad_max_length."}, ) unified_checkpoint: bool = field( - default=False, + default=True, metadata={"help": "Enable fused linear grad add strategy."}, ) - unified_checkpoint_config: Optional[str] = field( - default="", - metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, - ) def __post_init__(self): super().__post_init__() @@ -111,6 +102,7 @@ def __post_init__(self): self.report_to = [] self.save_strategy = IntervalStrategy.NO self.evaluation_strategy = IntervalStrategy.NO + self.unified_checkpoint = False @dataclass diff --git a/llm/run_quantization.py b/llm/run_quantization.py index e4f36d11fb88..232c0595ca9c 100644 --- a/llm/run_quantization.py +++ b/llm/run_quantization.py @@ -17,13 +17,7 @@ from functools import partial import paddle -from utils.argument import ( - DataArgument, - GenerateArgument, - ModelArgument, - QuantArgument, - TrainingArguments, -) +from utils.argument import GenerateArgument from utils.data import get_convert_example from paddlenlp.data import DataCollatorForSeq2Seq @@ -50,7 +44,7 @@ register_sequence_parallel_allreduce_hooks, ) from paddlenlp.transformers.configuration_utils import LlmMetaConfig -from paddlenlp.trl import SFTTrainer +from paddlenlp.trl import DataConfig, ModelConfig, QuantConfig, SFTConfig, SFTTrainer from paddlenlp.trl.llm_utils import ( ZeroPaddingIterDatasetCallback, compute_metrics, @@ -66,7 +60,7 @@ def main(): - parser = PdArgumentParser((GenerateArgument, QuantArgument, ModelArgument, DataArgument, TrainingArguments)) + parser = PdArgumentParser((GenerateArgument, QuantConfig, ModelConfig, DataConfig, SFTConfig)) if len(sys.argv) >= 2 and sys.argv[1].endswith(".json"): gen_args, quant_args, model_args, data_args, training_args = parser.parse_json_file_and_cmd_lines() else: diff --git a/llm/utils/argument.py b/llm/utils/argument.py index e425198d9c91..812293f1ab8f 100644 --- a/llm/utils/argument.py +++ b/llm/utils/argument.py @@ -12,249 +12,6 @@ # See the License for the 
specific language governing permissions and # limitations under the License. from dataclasses import dataclass, field -from typing import List, Optional - -from paddlenlp.trainer import TrainingArguments -from paddlenlp.trainer.trainer_utils import IntervalStrategy -from paddlenlp.transformers.configuration_utils import llmmetaclass -from paddlenlp.utils.log import logger - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -@dataclass -@llmmetaclass -@add_start_docstrings(TrainingArguments.__doc__) -class TrainingArguments(TrainingArguments): - benchmark: bool = field(default=False, metadata={"help": "Whether runs benchmark"}) - # NOTE(gongenlei): new add autotuner_benchmark - autotuner_benchmark: bool = field( - default=False, - metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."}, - ) - decay_steps: int = field( - default=0, - metadata={"help": "The steps use to control the learing rate."}, - ) - tensor_parallel_output: Optional[bool] = field( - default=False, - metadata={"help": "whether to output logits in distributed status"}, - ) - unified_checkpoint: bool = field( - default=False, - metadata={"help": "Unify hybrid parallel checkpoint."}, - ) - unified_checkpoint_config: Optional[str] = field( - default="", - metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"}, - ) - - def __post_init__(self): - super().__post_init__() - # NOTE(gongenlei): new add autotuner_benchmark - if self.autotuner_benchmark: - self.max_steps = 5 - self.do_train = True - self.do_export = False - self.do_predict = False - self.do_eval = False - self.overwrite_output_dir = True - self.load_best_model_at_end = False - self.report_to = [] - self.save_strategy = IntervalStrategy.NO - self.evaluation_strategy = IntervalStrategy.NO - if self.benchmark: - self.do_train = True - 
self.do_export = False - self.do_predict = False - self.do_eval = False - self.overwrite_output_dir = True - self.load_best_model_at_end = False - self.report_to = [] - self.save_strategy = IntervalStrategy.NO - self.evaluation_strategy = IntervalStrategy.NO - - -@dataclass -class DataArgument: - dataset_name_or_path: str = field(default=None, metadata={"help": "Name or path for dataset"}) - task_name: str = field(default=None, metadata={"help": "Additional name to select a more specific task."}) - zero_padding: bool = field(default=False, metadata={"help": "Whether to use Zero Padding data stream"}) - greedy_zero_padding: bool = field( - default=False, - metadata={ - "help": "Whether to use Greedy Zero Padding data stream, should be used together with `zero_padding=True`." - }, - ) - pad_to_multiple_of: int = field( - default=None, metadata={"help": "If set will pad the sequence to a multiple of the provided value."} - ) - src_length: int = field(default=1024, metadata={"help": "The maximum length of source(context) tokens."}) - max_length: int = field( - default=2048, - metadata={ - "help": "The maximum length that model input tokens can have. When Zero Padding is set to True, it's also the maximum length for Zero Padding data stream" - }, - ) - eval_with_do_generation: bool = field(default=False, metadata={"help": "Whether to do generation for evaluation"}) - save_generation_output: bool = field( - default=False, - metadata={"help": "Whether to save generated text to file when eval_with_do_generation set to True."}, - ) - lazy: bool = field( - default=False, - metadata={ - "help": "Weather to return `MapDataset` or an `IterDataset`.True for `IterDataset`. False for `MapDataset`." - }, - ) - chat_template: str = field( - default=None, - metadata={ - "help": "the path of `chat_template.json` file to handle multi-rounds conversation. 
If is None, it will not use `chat_template.json`; If is equal with `model_name_or_path`, it will use the default loading; If is directory, it will find the `chat_template.json` under the directory; If is file, it will load it." - }, - ) - # NOTE(gongenlei): deprecated params - task_name_or_path: str = field( - default=None, - metadata={ - "help": "@deprecated Please use `dataset_name_or_path`. Name or path for dataset, same as `dataset_name_or_path`." - }, - ) # Alias for dataset_name_or_path - intokens: bool = field( - default=None, - metadata={ - "help": "@deprecated Please use `zero_padding`. Whether to use InTokens data stream, same as `zero_padding`." - }, - ) # Alias for zero_padding - pad_to_max_length: bool = field( - default=False, - metadata={"help": "Pad the input sequence to `max_length`."}, - ) - autoregressive: bool = field( - default=False, - metadata={"help": "Whether to use autoregressive mode."}, - ) - # Pose ralated parameters - use_pose_convert: bool = field(default=False, metadata={"help": "Whether to use PoSE data conversion function"}) - - def __post_init__(self): - if self.task_name_or_path is not None: - logger.warning("`--task_name_or_path` is deprecated, please use `--dataset_name_or_path`.") - self.dataset_name_or_path = self.task_name_or_path - - if self.intokens is not None: - logger.warning("`--intokens` is deprecated, please use `--zero_padding`.") - self.zero_padding = self.intokens - - -@dataclass -class ModelArgument: - model_name_or_path: str = field( - default=None, metadata={"help": "Build-in pretrained model name or the path to local model."} - ) - tokenizer_name_or_path: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - use_fast_layer_norm: bool = field( - default=False, - metadata={"help": "GPT3 model, use fast layernorm"}, - ) - fuse_attention_qkv: bool = field( - default=None, - metadata={"help": "whether to fuse attention qkv"}, - ) - 
fuse_attention_ffn: bool = field( - default=None, - metadata={"help": "whether to fuse first up and gate proj in mlp block"}, - ) - hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."}) - attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."}) - - continue_training: bool = field( - default=True, - metadata={ - "help": "Whether to train from existing paddlenlp model weights. If set True, the model_name_or_path argument must exist in the paddlenlp models." - }, - ) - weight_quantize_algo: str = field( - default=None, - metadata={ - "help": "Model weight quantization algorithm including 'nf4', 'fp4','weight_only_int4', 'weight_only_int8'." - }, - ) - weight_blocksize: int = field( - default=64, - metadata={"help": "Block size for weight quantization(Only available for nf4 or fp4 quant_scale.)."}, - ) - weight_double_quant: bool = field( - default=False, metadata={"help": "Whether apply double quant(Only available for nf4 or fp4 quant_scale.)."} - ) - weight_double_quant_block_size: int = field( - default=256, - metadata={ - "help": "Block size for quant_scale of weight quant_scale(Only available for nf4 or fp4 quant_scale.)" - }, - ) - - # LoRA related parameters - lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"}) - lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."}) - lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"}) - use_quick_lora: bool = field( - default=False, - metadata={ - "help": "Whether to use quick lora, The use of Quick LoRa will only take effect when lora_dropout is set to 0." 
- }, - ) - rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"}) - lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+ technique"}) - pissa: bool = field(default=False, metadata={"help": "Whether to use Pissa: https://arxiv.org/pdf/2404.02948.pdf"}) - lora_use_mixer: bool = field( - default=False, metadata={"help": "Whether to use MosLoRA: https://arxiv.org/pdf/2406.11909"} - ) - - # vera related parameters - vera: bool = field(default=False, metadata={"help": "Whether to use vera technique"}) - vera_rank: int = field(default=8, metadata={"help": "Vera attention dimension"}) - - # lokr related parameter - lokr: bool = field(default=False, metadata={"help": "Whether to use LoKr technique"}) - lokr_path: str = field( - default=None, metadata={"help": "Initialize lokr state dict and apply customized lokr config"} - ) - lokr_dim: int = field(default=8, metadata={"help": "Lora dimention in LoKr dimension for adapter matrix"}) - - # prefix tuning related parameters - prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"}) - prefix_path: str = field(default=None, metadata={"help": "Initialize prefix state dict."}) - num_prefix_tokens: int = field(default=128, metadata={"help": "Number of prefix tokens"}) - - # reft related parameter - reft: bool = field(default=False, metadata={"help": "Whether using reft method"}) - - from_aistudio: bool = field(default=False, metadata={"help": "Whether to load model from aistudio"}) - save_to_aistudio: bool = field(default=False, metadata={"help": "Whether to save model to aistudio"}) - aistudio_repo_id: str = field(default=None, metadata={"help": "The id of aistudio repo"}) - aistudio_repo_private: bool = field(default=True, metadata={"help": "Whether to create a private repo"}) - aistudio_repo_license: str = field(default="Apache License 2.0", metadata={"help": "The license of aistudio repo"}) - aistudio_token: str = field(default=None, 
metadata={"help": "The token of aistudio"}) - neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"}) - neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"}) - flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."}) - - # long sequence strategy - use_long_sequence_strategies: bool = field( - default=False, metadata={"help": "Whether to use long sequence strategy"} - ) - rope_scaling_factor: float = field(default=1.0, metadata={"help": "Rope extension scaling factor"}) - strategy_type: str = field(default=None, metadata={"help": "Long sequence strategy type"}) - strategy_name: str = field(default=None, metadata={"help": "Long sequence strategy name"}) @dataclass @@ -268,106 +25,6 @@ class ReftArgument: dropout: float = field(default=0.0, metadata={"help": "Dropout rate."}) -@dataclass -class QuantArgument: - - # Quantization method config - quant_type: str = field( - default="a8w8", - metadata={"help": "Quantization type. 
Supported values: weight_only_int8, weight_only_int4, a8w8, a8w8c8"}, - ) - - fp8_type: List[str] = field( - default_factory=lambda: ["e4m3", "e4m3"], - metadata={"help": "Quantization type for (activation, weight)", "nargs": "+"}, - ) - - skip_list_names: List[str] = field( - default=lambda: [], metadata={"help": "Skip scales for quantization", "nargs": "+"} - ) - - weight_quant_method: str = field( - default="abs_max_channel_wise", - metadata={"help": "Weight quantization method, choosen from ['abs_max_channel_wise', 'groupwise']"}, - ) - - act_quant_method: str = field( - default="avg", - metadata={"help": "Activation quantization method, choosen from ['abs_max', 'avg']"}, - ) - - cachekv_quant_method: str = field( - default="avg_headwise", - metadata={"help": "KV quantization method, choosen from ['abs_max_headwise', 'avg_headwise']"}, - ) - - # Piecewise Search Smooth related parameters - search_alpha_min: float = field( - default=0.2, - metadata={"help": "The minimum alpha for piece search"}, - ) - - search_alpha_max: float = field( - default=0.8, - metadata={"help": "The maximum alpha for piece search"}, - ) - - search_scale_min: float = field( - default=1.0, - metadata={"help": "The minimum scale for piece search"}, - ) - - search_scale_max: float = field( - default=5.0, - metadata={"help": "The maximum scale for piece search"}, - ) - - # QAT related parameters - # Not Yet support - do_qat: bool = field(default=False, metadata={"help": "Whether to use QAT technique"}) - - # PTQ related parameters - do_ptq: bool = field(default=False, metadata={"help": "Whether to use PTQ"}) - ptq_step: int = field(default=32, metadata={"help": "Step for PTQ"}) - - # Pre-quant method Shift related parameters - shift: bool = field(default=False, metadata={"help": "Whether to use Shift"}) - shift_all_linears: bool = field(default=False, metadata={"help": "Whether to shift all linears"}) - shift_sampler: str = field( - default="ema", metadata={"help": "The name of shift 
sampler, choosen from ['ema', 'none']"} - ) - shift_step: int = field(default=32, metadata={"help": "Sample steps when shift"}) - - # Pre-quant methods Smooth related parameters - smooth: bool = field(default=False, metadata={"help": "Whether to use Smooth"}) - smooth_all_linears: bool = field(default=False, metadata={"help": "Whether to smooth all linears"}) - smooth_sampler: str = field( - default="none", metadata={"help": "The name of smooth sampler, choosen from ['multi_step','none']"} - ) - smooth_step: int = field(default=32, metadata={"help": "Sample steps when smooth"}) - smooth_piecewise_search: bool = field( - default=False, metadata={"help": "The number of piece in piecewise search for smooth strategy."} - ) - smooth_k_piece: int = field(default=3, metadata={"help": "Number of pieces for K-search"}) - smooth_search_piece: bool = field(default=False, metadata={"help": "Whether search k_piece when piecewise search"}) - - # GPTQ related parameters - do_gptq: bool = field(default=False, metadata={"help": "Whether to use GPTQ"}) - gptq_step: int = field(default=8, metadata={"help": "Step for GPTQ"}) - - # AWQ related parameters, default for WINT4 - do_awq: bool = field(default=False, metadata={"help": "Whether to use AWQ Search"}) - auto_clip: bool = field(default=False, metadata={"help": "Whether to use AutoClip from AWQ"}) - awq_step: int = field(default=8, metadata={"help": "Step for AWQ Search"}) - autoclip_step: int = field(default=8, metadata={"help": "Step for AutoClip"}) - - # Other config - load_quant_model: bool = field(default=False, metadata={"help": "Whether to load quant model"}) - - do_quant_debug: bool = field(default=False, metadata={"help": "Whether to use debug"}) - test_sample: Optional[str] = field(default=None, metadata={"help": "Test sample for quantization"}) - - @dataclass class GenerateArgument: top_k: int = field( diff --git a/paddlenlp/datasets/dataset.py b/paddlenlp/datasets/dataset.py index 7bf1b836828c..cf810a5196fc 100644 --- 
a/paddlenlp/datasets/dataset.py +++ b/paddlenlp/datasets/dataset.py @@ -113,7 +113,10 @@ def load_from_hf(path, name=None, splits=None, **kwargs): from datasets.features import ClassLabel try: - hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) + if "split" in kwargs: + hf_datasets = load_hf_dataset(path, name=name, **kwargs) + else: + hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) except FileNotFoundError: raise FileNotFoundError("Couldn't find the dataset script for '" + path + "' on PaddleNLP or HuggingFace") else: diff --git a/paddlenlp/trl/__init__.py b/paddlenlp/trl/__init__.py index a67fd0e69f6d..dcaff1fe023a 100644 --- a/paddlenlp/trl/__init__.py +++ b/paddlenlp/trl/__init__.py @@ -16,6 +16,10 @@ from .dpo_trainer import DPOTrainer from .kto_criterion import KTOCriterion from .kto_trainer import KTOTrainer +from .model_config import * +from .quant_config import * +from .sft_config import * from .sft_trainer import * +from .sftdata_config import * from .trl_data import * from .trl_utils import * diff --git a/paddlenlp/trl/model_config.py b/paddlenlp/trl/model_config.py new file mode 100644 index 000000000000..9e0058cff0b2 --- /dev/null +++ b/paddlenlp/trl/model_config.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from dataclasses import dataclass, field
from typing import Optional

__all__ = ["ModelConfig"]


@dataclass
class ModelConfig:
    """Model-side arguments for LLM fine-tuning.

    Groups the base-model selection, weight-quantization options, and the
    PEFT family of switches (LoRA / VeRA / LoKr / prefix tuning / ReFT),
    plus AI Studio hub and long-sequence options.  Fields that may be left
    unset are annotated ``Optional`` to match their ``None`` defaults.
    """

    model_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "Build-in pretrained model name or the path to local model."}
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    use_fast_layer_norm: bool = field(
        default=False,
        metadata={"help": "GPT3 model, use fast layernorm"},
    )
    # None means "let the model config decide"; True/False force the fusion on/off.
    fuse_attention_qkv: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to fuse attention qkv"},
    )
    fuse_attention_ffn: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to fuse first up and gate proj in mlp block"},
    )
    hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."})
    attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."})

    continue_training: bool = field(
        default=True,
        metadata={
            "help": "Whether to train from existing paddlenlp model weights. If set True, the model_name_or_path argument must exist in the paddlenlp models."
        },
    )
    weight_quantize_algo: Optional[str] = field(
        default=None,
        metadata={
            "help": "Model weight quantization algorithm including 'nf4', 'fp4','weight_only_int4', 'weight_only_int8'."
        },
    )
    weight_blocksize: int = field(
        default=64,
        metadata={"help": "Block size for weight quantization(Only available for nf4 or fp4 quant_scale.)."},
    )
    weight_double_quant: bool = field(
        default=False, metadata={"help": "Whether apply double quant(Only available for nf4 or fp4 quant_scale.)."}
    )
    weight_double_quant_block_size: int = field(
        default=256,
        metadata={
            "help": "Block size for quant_scale of weight quant_scale(Only available for nf4 or fp4 quant_scale.)"
        },
    )

    # LoRA related parameters
    lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
    lora_path: Optional[str] = field(default=None, metadata={"help": "Initialize lora state dict."})
    lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"})
    use_quick_lora: bool = field(
        default=False,
        metadata={
            "help": "Whether to use quick lora, The use of Quick LoRa will only take effect when lora_dropout is set to 0."
        },
    )
    rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"})
    lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+ technique"})
    pissa: bool = field(default=False, metadata={"help": "Whether to use Pissa: https://arxiv.org/pdf/2404.02948.pdf"})
    lora_use_mixer: bool = field(
        default=False, metadata={"help": "Whether to use MosLoRA: https://arxiv.org/pdf/2406.11909"}
    )

    # vera related parameters
    vera: bool = field(default=False, metadata={"help": "Whether to use vera technique"})
    vera_rank: int = field(default=8, metadata={"help": "Vera attention dimension"})

    # lokr related parameter
    lokr: bool = field(default=False, metadata={"help": "Whether to use LoKr technique"})
    lokr_path: Optional[str] = field(
        default=None, metadata={"help": "Initialize lokr state dict and apply customized lokr config"}
    )
    lokr_dim: int = field(default=8, metadata={"help": "LoKr dimension for the adapter matrix"})

    # prefix tuning related parameters
    prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"})
    prefix_path: Optional[str] = field(default=None, metadata={"help": "Initialize prefix state dict."})
    num_prefix_tokens: int = field(default=128, metadata={"help": "Number of prefix tokens"})

    # reft related parameter
    reft: bool = field(default=False, metadata={"help": "Whether using reft method"})

    from_aistudio: bool = field(default=False, metadata={"help": "Whether to load model from aistudio"})
    save_to_aistudio: bool = field(default=False, metadata={"help": "Whether to save model to aistudio"})
    aistudio_repo_id: Optional[str] = field(default=None, metadata={"help": "The id of aistudio repo"})
    aistudio_repo_private: bool = field(default=True, metadata={"help": "Whether to create a private repo"})
    aistudio_repo_license: str = field(default="Apache License 2.0", metadata={"help": "The license of aistudio repo"})
    aistudio_token: Optional[str] = field(default=None, metadata={"help": "The token of aistudio"})
    neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"})
    neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"})
    flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."})

    # long sequence strategy
    use_long_sequence_strategies: bool = field(
        default=False, metadata={"help": "Whether to use long sequence strategy"}
    )
    rope_scaling_factor: float = field(default=1.0, metadata={"help": "Rope extension scaling factor"})
    strategy_type: Optional[str] = field(default=None, metadata={"help": "Long sequence strategy type"})
    strategy_name: Optional[str] = field(default=None, metadata={"help": "Long sequence strategy name"})
from dataclasses import dataclass, field
from typing import List, Optional

__all__ = ["QuantConfig"]


@dataclass
class QuantConfig:
    """Arguments controlling post-training quantization (PTQ/GPTQ/AWQ) and
    the shift/smooth pre-quantization passes.

    All fields are plain hyperparameters consumed by the quantization
    pipeline; defaults reproduce the standard a8w8 PTQ setup.
    """

    # Quantization method config
    quant_type: str = field(
        default="a8w8",
        metadata={"help": "Quantization type. Supported values: weight_only_int8, weight_only_int4, a8w8, a8w8c8"},
    )

    fp8_type: List[str] = field(
        default_factory=lambda: ["e4m3", "e4m3"],
        metadata={"help": "Quantization type for (activation, weight)", "nargs": "+"},
    )

    # BUGFIX: this previously used `default=lambda: []`, which makes the
    # dataclass default the lambda object itself (only `default_factory`
    # calls a factory); an empty list is what is intended.
    skip_list_names: List[str] = field(
        default_factory=list, metadata={"help": "Skip scales for quantization", "nargs": "+"}
    )

    weight_quant_method: str = field(
        default="abs_max_channel_wise",
        metadata={"help": "Weight quantization method, chosen from ['abs_max_channel_wise', 'groupwise']"},
    )

    act_quant_method: str = field(
        default="avg",
        metadata={"help": "Activation quantization method, chosen from ['abs_max', 'avg']"},
    )

    cachekv_quant_method: str = field(
        default="avg_headwise",
        metadata={"help": "KV quantization method, chosen from ['abs_max_headwise', 'avg_headwise']"},
    )

    # Piecewise Search Smooth related parameters
    search_alpha_min: float = field(
        default=0.2,
        metadata={"help": "The minimum alpha for piece search"},
    )

    search_alpha_max: float = field(
        default=0.8,
        metadata={"help": "The maximum alpha for piece search"},
    )

    search_scale_min: float = field(
        default=1.0,
        metadata={"help": "The minimum scale for piece search"},
    )

    search_scale_max: float = field(
        default=5.0,
        metadata={"help": "The maximum scale for piece search"},
    )

    # QAT related parameters
    # Not Yet support
    do_qat: bool = field(default=False, metadata={"help": "Whether to use QAT technique"})

    # PTQ related parameters
    do_ptq: bool = field(default=False, metadata={"help": "Whether to use PTQ"})
    ptq_step: int = field(default=32, metadata={"help": "Step for PTQ"})

    # Pre-quant method Shift related parameters
    shift: bool = field(default=False, metadata={"help": "Whether to use Shift"})
    shift_all_linears: bool = field(default=False, metadata={"help": "Whether to shift all linears"})
    shift_sampler: str = field(
        default="ema", metadata={"help": "The name of shift sampler, chosen from ['ema', 'none']"}
    )
    shift_step: int = field(default=32, metadata={"help": "Sample steps when shift"})

    # Pre-quant methods Smooth related parameters
    smooth: bool = field(default=False, metadata={"help": "Whether to use Smooth"})
    smooth_all_linears: bool = field(default=False, metadata={"help": "Whether to smooth all linears"})
    smooth_sampler: str = field(
        default="none", metadata={"help": "The name of smooth sampler, chosen from ['multi_step','none']"}
    )
    smooth_step: int = field(default=32, metadata={"help": "Sample steps when smooth"})
    smooth_piecewise_search: bool = field(
        default=False, metadata={"help": "Whether to use piecewise search for smooth strategy."}
    )
    smooth_k_piece: int = field(default=3, metadata={"help": "Number of pieces for K-search"})
    smooth_search_piece: bool = field(default=False, metadata={"help": "Whether search k_piece when piecewise search"})

    # GPTQ related parameters
    do_gptq: bool = field(default=False, metadata={"help": "Whether to use GPTQ"})
    gptq_step: int = field(default=8, metadata={"help": "Step for GPTQ"})

    # AWQ related parameters, default for WINT4
    do_awq: bool = field(default=False, metadata={"help": "Whether to use AWQ Search"})
    auto_clip: bool = field(default=False, metadata={"help": "Whether to use AutoClip from AWQ"})
    awq_step: int = field(default=8, metadata={"help": "Step for AWQ Search"})
    autoclip_step: int = field(default=8, metadata={"help": "Step for AutoClip"})

    # Other config
    load_quant_model: bool = field(default=False, metadata={"help": "Whether to load quant model"})

    do_quant_debug: bool = field(default=False, metadata={"help": "Whether to use debug"})
    test_sample: Optional[str] = field(default=None, metadata={"help": "Test sample for quantization"})
from dataclasses import dataclass, field
from typing import Optional

from paddlenlp.trainer import TrainingArguments
from paddlenlp.trainer.trainer_utils import IntervalStrategy
from paddlenlp.trainer.utils.doc import add_start_docstrings
from paddlenlp.transformers.configuration_utils import llmmetaclass

__all__ = ["SFTConfig", "DataConfig"]


@dataclass
@llmmetaclass
@add_start_docstrings(TrainingArguments.__doc__)
class SFTConfig(TrainingArguments):
    """Training arguments for SFT, extending TrainingArguments with
    benchmark switches and unified-checkpoint options."""

    benchmark: bool = field(default=False, metadata={"help": "Whether runs benchmark"})
    # NOTE(gongenlei): new add autotuner_benchmark
    autotuner_benchmark: bool = field(
        default=False,
        metadata={"help": "Whether to run benchmark by autotuner. True for from_scratch and pad_max_length."},
    )
    decay_steps: int = field(
        default=0,
        metadata={"help": "The steps use to control the learning rate."},
    )
    tensor_parallel_output: Optional[bool] = field(
        default=False,
        metadata={"help": "whether to output logits in distributed status"},
    )
    unified_checkpoint: bool = field(
        default=False,
        metadata={"help": "Unify hybrid parallel checkpoint."},
    )
    unified_checkpoint_config: Optional[str] = field(
        default="",
        metadata={"help": "Configs to unify hybrid parallel checkpoint.\n"},
    )

    def __post_init__(self):
        super().__post_init__()
        # Benchmark runs (manual or autotuner-driven) only train: disable
        # eval/predict/export, checkpoint saving and reporting so that the
        # timing is not polluted by side work.  Previously these nine
        # assignments were duplicated verbatim in both branches.
        if self.benchmark or self.autotuner_benchmark:
            self.do_train = True
            self.do_export = False
            self.do_predict = False
            self.do_eval = False
            self.overwrite_output_dir = True
            self.load_best_model_at_end = False
            self.report_to = []
            self.save_strategy = IntervalStrategy.NO
            self.evaluation_strategy = IntervalStrategy.NO
        if self.autotuner_benchmark:
            # NOTE(gongenlei): the autotuner only needs a few steps to profile.
            self.max_steps = 5


@dataclass
class DataConfig:
    """Dataset-side arguments for SFT: dataset location, sequence lengths,
    Zero Padding streaming, chat templates and PoSE conversion."""

    dataset_name_or_path: Optional[str] = field(default=None, metadata={"help": "Name or path for dataset"})
    task_name: Optional[str] = field(
        default=None, metadata={"help": "Additional name to select a more specific task."}
    )
    zero_padding: bool = field(default=False, metadata={"help": "Whether to use Zero Padding data stream"})
    greedy_zero_padding: bool = field(
        default=False,
        metadata={
            "help": "Whether to use Greedy Zero Padding data stream, should be used together with `zero_padding=True`."
        },
    )
    pad_to_multiple_of: Optional[int] = field(
        default=None, metadata={"help": "If set will pad the sequence to a multiple of the provided value."}
    )
    src_length: int = field(default=1024, metadata={"help": "The maximum length of source(context) tokens."})
    max_length: int = field(
        default=2048,
        metadata={
            "help": "The maximum length that model input tokens can have. When Zero Padding is set to True, it's also the maximum length for Zero Padding data stream"
        },
    )
    eval_with_do_generation: bool = field(default=False, metadata={"help": "Whether to do generation for evaluation"})
    save_generation_output: bool = field(
        default=False,
        metadata={"help": "Whether to save generated text to file when eval_with_do_generation set to True."},
    )
    lazy: bool = field(
        default=False,
        metadata={
            "help": "Whether to return `MapDataset` or an `IterDataset`.True for `IterDataset`. False for `MapDataset`."
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={
            "help": "the path of `chat_template.json` file to handle multi-rounds conversation. If is None, it will not use `chat_template.json`; If is equal with `model_name_or_path`, it will use the default loading; If is directory, it will find the `chat_template.json` under the directory; If is file, it will load it."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={"help": "Pad the input sequence to `max_length`."},
    )
    autoregressive: bool = field(
        default=False,
        metadata={"help": "Whether to use autoregressive mode."},
    )
    # PoSE related parameters
    use_pose_convert: bool = field(default=False, metadata={"help": "Whether to use PoSE data conversion function"})
from dataclasses import dataclass
from typing import Optional


@dataclass
class ScriptArguments:
    """
    Arguments common to all scripts.

    dataset_name (`str`):
        Dataset name.
    dataset_train_split (`str`, *optional*, defaults to `"train"`):
        Dataset split to use for training.
    dataset_test_split (`str`, *optional*, defaults to `"test"`):
        Dataset split to use for evaluation.
    config (`str` or `None`, *optional*, defaults to `None`):
        Path to the optional config file.
    gradient_checkpointing_use_reentrant (`bool`, *optional*, defaults to `False`):
        Whether to apply `use_reentrant` for gradient_checkpointing.
    ignore_bias_buffers (`bool`, *optional*, defaults to `False`):
        Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid scalar type,
        inplace operation. See https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992.
    """

    dataset_name: str
    dataset_train_split: str = "train"
    dataset_test_split: str = "test"
    config: Optional[str] = None
    # BUGFIX: documented above but previously missing from the field list.
    gradient_checkpointing_use_reentrant: bool = False
    ignore_bias_buffers: bool = False