From eaaf8a26f6acaf354d98e2dcd56c28e34d3a162b Mon Sep 17 00:00:00 2001 From: Abhishek Thakur Date: Fri, 3 May 2024 14:06:29 +0200 Subject: [PATCH] add configs --- configs/dreambooth/sd15_colab.yml | 24 ++++++ configs/dreambooth/sdxl_colab.yml | 25 ++++++ configs/image_classification/hub_dataset.yml | 27 ++++++ configs/llm_finetuning/gpt2_sft.yml | 2 +- configs/llm_finetuning/llama3-8b-orpo.yml | 2 +- .../{imdb_bert.yml => hub_dataset.yml} | 2 +- configs/text_classification/local_dataset.yml | 28 +++++++ configs/text_regression/hub_dataset.yml | 28 +++++++ configs/text_regression/local_dataset.yml | 28 +++++++ configs/token_classification/hub_dataset.yml | 28 +++++++ .../token_classification/local_dataset.yml | 28 +++++++ src/autotrain/__init__.py | 2 +- src/autotrain/cli/autotrain.py | 4 +- src/autotrain/cli/utils.py | 10 +-- src/autotrain/{configparser.py => parser.py} | 84 ++++++++++++------- 15 files changed, 282 insertions(+), 40 deletions(-) create mode 100644 configs/dreambooth/sd15_colab.yml create mode 100644 configs/dreambooth/sdxl_colab.yml create mode 100644 configs/image_classification/hub_dataset.yml rename configs/text_classification/{imdb_bert.yml => hub_dataset.yml} (92%) create mode 100644 configs/text_classification/local_dataset.yml create mode 100644 configs/text_regression/hub_dataset.yml create mode 100644 configs/text_regression/local_dataset.yml create mode 100644 configs/token_classification/hub_dataset.yml create mode 100644 configs/token_classification/local_dataset.yml rename src/autotrain/{configparser.py => parser.py} (61%) diff --git a/configs/dreambooth/sd15_colab.yml b/configs/dreambooth/sd15_colab.yml new file mode 100644 index 0000000000..378458e27f --- /dev/null +++ b/configs/dreambooth/sd15_colab.yml @@ -0,0 +1,24 @@ +task: dreambooth +base_model: runwayml/stable-diffusion-v1-5 +project_name: autotrain-sd15-finetuned +backend: local-cli + +data: + path: data/ # store all images in this folder + prompt: photo of sks person # 
prompt for the model + +params: + resolution: 512 + batch_size: 1 + num_steps: 500 + lr: 1e-4 + gradient_accumulation: 4 + mixed_precision: fp16 + train_text_encoder: false + xformers: false + use_8bit_adam: false + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/dreambooth/sdxl_colab.yml b/configs/dreambooth/sdxl_colab.yml new file mode 100644 index 0000000000..31344eae32 --- /dev/null +++ b/configs/dreambooth/sdxl_colab.yml @@ -0,0 +1,25 @@ +task: dreambooth +base_model: stabilityai/stable-diffusion-xl-base-1.0 +project_name: autotrain-sdxl-finetuned +backend: local-cli + +data: + path: data/ # store all images in this folder + prompt: photo of sks person # prompt for the model + +params: + resolution: 1024 + batch_size: 1 + num_steps: 500 + lr: 1e-4 + gradient_accumulation: 4 + mixed_precision: fp16 + train_text_encoder: false + xformers: false + use_8bit_adam: false + xl: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_classification/hub_dataset.yml b/configs/image_classification/hub_dataset.yml new file mode 100644 index 0000000000..693dc5e818 --- /dev/null +++ b/configs/image_classification/hub_dataset.yml @@ -0,0 +1,27 @@ +task: image_classification +base_model: google/vit-base-patch16-224 +project_name: autotrain-cats-vs-dogs-finetuned +log: tensorboard +backend: local-cli + +data: + path: cats_vs_dogs + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: labels + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/gpt2_sft.yml b/configs/llm_finetuning/gpt2_sft.yml index 7527d47b50..19eb2d6739 100644 --- 
a/configs/llm_finetuning/gpt2_sft.yml +++ b/configs/llm_finetuning/gpt2_sft.yml @@ -1,4 +1,4 @@ -task: lm_training +task: llm base_model: openai-community/gpt2 project_name: autotrain-gpt2-finetuned-guanaco log: tensorboard diff --git a/configs/llm_finetuning/llama3-8b-orpo.yml b/configs/llm_finetuning/llama3-8b-orpo.yml index 40fa6078a1..c159cc8b93 100644 --- a/configs/llm_finetuning/llama3-8b-orpo.yml +++ b/configs/llm_finetuning/llama3-8b-orpo.yml @@ -1,4 +1,4 @@ -task: lm_training +task: llm base_model: meta-llama/Meta-Llama-3-8B-Instruct project_name: llama3-8b-orpo log: tensorboard diff --git a/configs/text_classification/imdb_bert.yml b/configs/text_classification/hub_dataset.yml similarity index 92% rename from configs/text_classification/imdb_bert.yml rename to configs/text_classification/hub_dataset.yml index 87f62d24d4..20dfaa6574 100644 --- a/configs/text_classification/imdb_bert.yml +++ b/configs/text_classification/hub_dataset.yml @@ -1,4 +1,4 @@ -task: text_multi_class_classification +task: text_classification base_model: google-bert/bert-base-uncased project_name: autotrain-bert-imdb-finetuned log: tensorboard diff --git a/configs/text_classification/local_dataset.yml b/configs/text_classification/local_dataset.yml new file mode 100644 index 0000000000..53434b1ef8 --- /dev/null +++ b/configs/text_classification/local_dataset.yml @@ -0,0 +1,28 @@ +task: text_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-imdb-finetuned +log: tensorboard +backend: local-cli + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: text # this must be the name of the column containing the text + target_column: label # this must be the name of the column containing the target + +params: + max_seq_length: 512 + epochs: 3 + 
batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_regression/hub_dataset.yml b/configs/text_regression/hub_dataset.yml new file mode 100644 index 0000000000..2d9b709cdf --- /dev/null +++ b/configs/text_regression/hub_dataset.yml @@ -0,0 +1,28 @@ +task: text_regression +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-sms-spam-finetuned +log: tensorboard +backend: local-cli + +data: + path: sms_spam + train_split: train + valid_split: null + column_mapping: + text_column: sms + target_column: label + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_regression/local_dataset.yml b/configs/text_regression/local_dataset.yml new file mode 100644 index 0000000000..e9e2262387 --- /dev/null +++ b/configs/text_regression/local_dataset.yml @@ -0,0 +1,28 @@ +task: text_regression +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-custom-finetuned +log: tensorboard +backend: local-cli + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: text # this must be the name of the column containing the text + target_column: label # this must be the name of the column containing the target + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + 
token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/token_classification/hub_dataset.yml b/configs/token_classification/hub_dataset.yml new file mode 100644 index 0000000000..4f38a62a1a --- /dev/null +++ b/configs/token_classification/hub_dataset.yml @@ -0,0 +1,28 @@ +task: token_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-conll2003-finetuned +log: tensorboard +backend: local-cli + +data: + path: conll2003 + train_split: train + valid_split: validation + column_mapping: + tokens_column: tokens + tags_column: ner_tags + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/token_classification/local_dataset.yml b/configs/token_classification/local_dataset.yml new file mode 100644 index 0000000000..e6ca556df3 --- /dev/null +++ b/configs/token_classification/local_dataset.yml @@ -0,0 +1,28 @@ +task: token_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-custom-finetuned +log: tensorboard +backend: local-cli + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.json + valid_split: valid # this must be either valid.json, can also be set to null + column_mapping: + tokens_column: tokens # this must be the name of the column containing the tokens + tags_column: tags # this must be the name of the column containing the tags + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/src/autotrain/__init__.py 
b/src/autotrain/__init__.py index da21dd2ddb..9d87d15519 100644 --- a/src/autotrain/__init__.py +++ b/src/autotrain/__init__.py @@ -41,4 +41,4 @@ logger = Logger().get_logger() -__version__ = "0.7.77.dev0" +__version__ = "0.7.78.dev0" diff --git a/src/autotrain/cli/autotrain.py b/src/autotrain/cli/autotrain.py index 1c2bce378d..0eb76dea34 100644 --- a/src/autotrain/cli/autotrain.py +++ b/src/autotrain/cli/autotrain.py @@ -14,7 +14,7 @@ from autotrain.cli.run_text_regression import RunAutoTrainTextRegressionCommand from autotrain.cli.run_token_classification import RunAutoTrainTokenClassificationCommand from autotrain.cli.run_tools import RunAutoTrainToolsCommand -from autotrain.configparser import ConfigParser +from autotrain.parser import AutoTrainConfigParser def main(): @@ -50,7 +50,7 @@ def main(): if args.config: logger.info(f"Using AutoTrain configuration: {args.config}") - cp = ConfigParser(args.config) + cp = AutoTrainConfigParser(args.config) cp.run() exit(0) diff --git a/src/autotrain/cli/utils.py b/src/autotrain/cli/utils.py index bb87372210..b2e0858c32 100644 --- a/src/autotrain/cli/utils.py +++ b/src/autotrain/cli/utils.py @@ -380,11 +380,11 @@ def token_clf_munge_data(params, local): def img_clf_munge_data(params, local): train_data_path = f"{params.data_path}/{params.train_split}" - if params.valid_split is not None: - valid_data_path = f"{params.data_path}/{params.valid_split}" - else: - valid_data_path = None - if os.path.isdir(train_data_path) or os.path.isdir(valid_data_path): + # if params.valid_split is not None: + # valid_data_path = f"{params.data_path}/{params.valid_split}" + # else: + # valid_data_path = None + if os.path.isdir(train_data_path): raise Exception("Image classification is not yet supported for local datasets using the CLI. 
Please use UI.") return params diff --git a/src/autotrain/configparser.py b/src/autotrain/parser.py similarity index 61% rename from src/autotrain/configparser.py rename to src/autotrain/parser.py index cce980f941..940b0a6ecd 100644 --- a/src/autotrain/configparser.py +++ b/src/autotrain/parser.py @@ -1,8 +1,10 @@ import os from dataclasses import dataclass +import requests import yaml +from autotrain import logger from autotrain.cli.utils import ( dreambooth_munge_data, img_clf_munge_data, @@ -26,13 +28,20 @@ @dataclass -class ConfigParser: - config_file: str +class AutoTrainConfigParser: + config_path: str def __post_init__(self): - with open(self.config_file, "r") as f: - self.config = yaml.safe_load(f) - self.parsed_config = self._parse_config() + if self.config_path.startswith("http"): + response = requests.get(self.config_path) + if response.status_code == 200: + self.config = yaml.safe_load(response.content) + else: + raise ValueError("Failed to retrieve YAML file.") + else: + with open(self.config_path, "r") as f: + self.config = yaml.safe_load(f) + self.task_param_map = { "lm_training": LLMTrainingParams, "dreambooth": DreamBoothTrainingParams, @@ -43,46 +52,69 @@ def __post_init__(self): "text_binary_classification": TextClassificationParams, "text_multi_class_classification": TextClassificationParams, "text_single_column_regression": TextRegressionParams, - "token_classification": TokenClassificationParams, + "text_token_classification": TokenClassificationParams, } self.munge_data_map = { "lm_training": llm_munge_data, "dreambooth": dreambooth_munge_data, "tabular": tabular_munge_data, "seq2seq": seq2seq_munge_data, - "image_binary_classification": img_clf_munge_data, "image_multi_class_classification": img_clf_munge_data, - "text_binary_classification": text_clf_munge_data, "text_multi_class_classification": text_clf_munge_data, - "token_classification": token_clf_munge_data, + "text_token_classification": token_clf_munge_data, 
"text_single_column_regression": text_reg_munge_data, } - - def _parse_config(self): + self.task_aliases = { + "llm": "lm_training", + "llm_training": "lm_training", + "llm_finetuning": "lm_training", + "dreambooth": "dreambooth", + "image_binary_classification": "image_multi_class_classification", + "image_classification": "image_multi_class_classification", + "seq2seq": "seq2seq", + "tabular": "tabular", + "text_binary_classification": "text_multi_class_classification", + "text_classification": "text_multi_class_classification", + "text_single_column_regression": "text_single_column_regression", + "text_regression": "text_single_column_regression", + "token_classification": "text_token_classification", + } task = self.config.get("task") - if task is None: + self.task = self.task_aliases.get(task, task) + if self.task is None: raise ValueError("Task is required in the configuration file") - if task not in TASKS: - raise ValueError(f"Task `{task}` is not supported") + if self.task not in TASKS: + raise ValueError(f"Task `{self.task}` is not supported") + self.backend = self.config.get("backend") + if self.backend is None: + raise ValueError("Backend is required in the configuration file") + + logger.info(f"Running task: {self.task}") + logger.info(f"Using backend: {self.backend}") + + self.parsed_config = self._parse_config() + + def _parse_config(self): params = { "model": self.config["base_model"], "project_name": self.config["project_name"], - "log": self.config["log"], } - if task == "dreambooth": + if self.task == "dreambooth": params["image_path"] = self.config["data"]["path"] + params["prompt"] = self.config["data"]["prompt"] else: params["data_path"] = self.config["data"]["path"] - if task == "lm_training": + if self.task == "lm_training": params["chat_template"] = self.config["data"]["chat_template"] - if task != "dreambooth": + if self.task != "dreambooth": for k, v in self.config["data"]["column_mapping"].items(): params[k] = v params["train_split"] = 
self.config["data"]["train_split"] params["valid_split"] = self.config["data"]["valid_split"] + params["log"] = self.config["log"] if "hub" in self.config: params["username"] = self.config["hub"]["username"] @@ -108,15 +140,9 @@ def _parse_config(self): return params def run(self): - backend = self.config.get("backend") - task = self.config.get("task") - if backend is None: - raise ValueError("Backend is required in the configuration file") - if task is None: - raise ValueError("Task is required in the configuration file") - - _params = self.task_param_map[self.config["task"]](**self.parsed_config) - _munge_fn = self.munge_data_map[self.config["task"]] - _munge_fn(_params, local=backend.startswith("local")) - project = AutoTrainProject(params=_params, backend=backend) + _params = self.task_param_map[self.task](**self.parsed_config) + logger.info(_params) + _munge_fn = self.munge_data_map[self.task] + _munge_fn(_params, local=self.backend.startswith("local")) + project = AutoTrainProject(params=_params, backend=self.backend) _ = project.create()