From 52eb2d464f6ccd0ddf6bd45ce9cae6e7824d5352 Mon Sep 17 00:00:00 2001
From: abhishekkrthakur
Date: Thu, 19 Sep 2024 15:42:21 +0200
Subject: [PATCH] add distributed backend param

---
 src/autotrain/app/params.py          |   1 +
 src/autotrain/commands.py            | 128 +++++++++++++++------------
 src/autotrain/trainers/clm/params.py |   1 +
 3 files changed, 72 insertions(+), 58 deletions(-)

diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py
index 54e04a2cf0..586df83e34 100644
--- a/src/autotrain/app/params.py
+++ b/src/autotrain/app/params.py
@@ -81,6 +81,7 @@
     padding="right",
     chat_template="none",
     max_completion_length=128,
+    distributed_backend="ddp",
 ).model_dump()
 
 PARAMS["text-classification"] = TextClassificationParams(
diff --git a/src/autotrain/commands.py b/src/autotrain/commands.py
index 909ca5c453..b6d4836031 100644
--- a/src/autotrain/commands.py
+++ b/src/autotrain/commands.py
@@ -20,6 +20,75 @@
 from autotrain.trainers.vlm.params import VLMTrainingParams
 
 
+CPU_COMMAND = [
+    "accelerate",
+    "launch",
+    "--cpu",
+]
+
+SINGLE_GPU_COMMAND = [
+    "accelerate",
+    "launch",
+    "--num_machines",
+    "1",
+    "--num_processes",
+    "1",
+]
+
+
+def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_backend=None):
+    """
+    Returns the accelerate command based on the number of GPUs available.
+
+    Args:
+        num_gpus: Number of GPUs available.
+        gradient_accumulation_steps: Number of gradient accumulation steps.
+        distributed_backend: Distributed backend to use: ddp, deepspeed, None.
+
+    Returns:
+        List: Accelerate command.
+    """
+    if num_gpus == 0:
+        logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
+        return CPU_COMMAND
+
+    if num_gpus == 1:
+        return SINGLE_GPU_COMMAND
+
+    if distributed_backend in ("ddp", None):
+        return [
+            "accelerate",
+            "launch",
+            "--multi_gpu",
+            "--num_machines",
+            "1",
+            "--num_processes",
+            str(num_gpus),
+        ]
+    elif distributed_backend == "deepspeed":
+        return [
+            "accelerate",
+            "launch",
+            "--use_deepspeed",
+            "--zero_stage",
+            "3",
+            "--offload_optimizer_device",
+            "none",
+            "--offload_param_device",
+            "none",
+            "--zero3_save_16bit_model",
+            "true",
+            "--zero3_init_flag",
+            "true",
+            "--deepspeed_multinode_launcher",
+            "standard",
+            "--gradient_accumulation_steps",
+            str(gradient_accumulation_steps),
+        ]
+    else:
+        raise ValueError("Unsupported distributed backend")
+
+
 def launch_command(params):
     """
     Launches training command based on the given parameters.
...
@@ -43,64 +112,7 @@ def launch_command(params):
     else:
         num_gpus = 0
     if isinstance(params, LLMTrainingParams):
-        if num_gpus == 0:
-            logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
-            cmd = [
-                "accelerate",
-                "launch",
-                "--cpu",
-            ]
-        elif num_gpus == 1:
-            cmd = [
-                "accelerate",
-                "launch",
-                "--num_machines",
-                "1",
-                "--num_processes",
-                "1",
-            ]
-        elif num_gpus == 2:
-            cmd = [
-                "accelerate",
-                "launch",
-                "--multi_gpu",
-                "--num_machines",
-                "1",
-                "--num_processes",
-                "2",
-            ]
-        else:
-            if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16":
-                cmd = [
-                    "accelerate",
-                    "launch",
-                    "--multi_gpu",
-                    "--num_machines",
-                    "1",
-                    "--num_processes",
-                    str(num_gpus),
-                ]
-            else:
-                cmd = [
-                    "accelerate",
-                    "launch",
-                    "--use_deepspeed",
-                    "--zero_stage",
-                    "3",
-                    "--offload_optimizer_device",
-                    "none",
-                    "--offload_param_device",
-                    "none",
-                    "--zero3_save_16bit_model",
-                    "true",
-                    "--zero3_init_flag",
-                    "true",
-                    "--deepspeed_multinode_launcher",
-                    "standard",
-                    "--gradient_accumulation_steps",
-                    str(params.gradient_accumulation),
-                ]
-
+        cmd = get_accelerate_command(num_gpus, params.gradient_accumulation, params.distributed_backend)
         if num_gpus > 0:
             cmd.append("--mixed_precision")
             if params.mixed_precision == "fp16":
diff --git a/src/autotrain/trainers/clm/params.py b/src/autotrain/trainers/clm/params.py
index 9b1ba3d6b3..a2362bd68a 100644
--- a/src/autotrain/trainers/clm/params.py
+++ b/src/autotrain/trainers/clm/params.py
@@ -69,3 +69,4 @@ class LLMTrainingParams(AutoTrainParams):
 
     # unsloth
     unsloth: bool = Field(False, title="Use unsloth")
+    distributed_backend: Optional[str] = Field(None, title="Distributed backend")
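
Usage sketch (reviewer note, not part of the patch): a minimal illustration of how the new helper behaves once this patch is applied. It imports get_accelerate_command from autotrain.commands as introduced above; the file name usage_sketch.py and the example argument values are only assumptions for illustration.

    # usage_sketch.py -- illustrative only, assumes this patch has been applied
    from autotrain.commands import get_accelerate_command

    # With 0 GPUs the helper returns CPU_COMMAND, with 1 GPU SINGLE_GPU_COMMAND;
    # the distributed_backend argument only matters for multi-GPU runs.

    # Multi-GPU with the default backend ("ddp" or None) -> plain DDP launch.
    ddp_cmd = get_accelerate_command(num_gpus=2, gradient_accumulation_steps=4, distributed_backend="ddp")
    print(" ".join(ddp_cmd))
    # accelerate launch --multi_gpu --num_machines 1 --num_processes 2

    # Multi-GPU with the DeepSpeed ZeRO stage 3 flags baked in.
    ds_cmd = get_accelerate_command(num_gpus=2, gradient_accumulation_steps=4, distributed_backend="deepspeed")
    print(" ".join(ds_cmd))
    # accelerate launch --use_deepspeed --zero_stage 3 ... --gradient_accumulation_steps 4

    # Any other backend string raises ValueError("Unsupported distributed backend").

launch_command builds on this base command (for example appending --mixed_precision for GPU runs), and end users select the backend through the new LLMTrainingParams.distributed_backend field; the app UI defaults it to "ddp" via app/params.py.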