Commit eaaf8a2
add configs
abhishekkrthakur committed May 3, 2024
1 parent f39b93c
Showing 15 changed files with 282 additions and 40 deletions.
24 changes: 24 additions & 0 deletions configs/dreambooth/sd15_colab.yml
@@ -0,0 +1,24 @@
task: dreambooth
base_model: runwayml/stable-diffusion-v1-5
project_name: autotrain-sd15-finetuned
backend: local-cli

data:
  path: data/ # store all images in this folder
  prompt: photo of sks person # prompt for the model

params:
  resolution: 512
  batch_size: 1
  num_steps: 500
  lr: 1e-4
  gradient_accumulation: 4
  mixed_precision: fp16
  train_text_encoder: false
  xformers: false
  use_8bit_adam: false

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
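Usage note (not part of the diff): the change to src/autotrain/cli/autotrain.py later in this commit routes --config files through AutoTrainConfigParser, so a config like the one above can also be launched from Python. A minimal sketch, assuming the package is installed and HF_USERNAME / HF_TOKEN are available; the credential values here are placeholders:

```python
import os

from autotrain.parser import AutoTrainConfigParser

# The hub section above references ${HF_USERNAME} and ${HF_TOKEN}, so both
# are expected in the environment before the job is launched.
os.environ["HF_USERNAME"] = "your-hf-username"  # placeholder
os.environ["HF_TOKEN"] = "hf_xxx"               # placeholder

# Parse the YAML config and run the DreamBooth job, mirroring what
# `autotrain --config configs/dreambooth/sd15_colab.yml` does via the CLI.
parser = AutoTrainConfigParser("configs/dreambooth/sd15_colab.yml")
parser.run()
```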
25 changes: 25 additions & 0 deletions configs/dreambooth/sdxl_colab.yml
@@ -0,0 +1,25 @@
task: dreambooth
base_model: stabilityai/stable-diffusion-xl-base-1.0
project_name: autotrain-sdxl-finetuned
backend: local-cli

data:
  path: data/ # store all images in this folder
  prompt: photo of sks person # prompt for the model

params:
  resolution: 1024
  batch_size: 1
  num_steps: 500
  lr: 1e-4
  gradient_accumulation: 4
  mixed_precision: fp16
  train_text_encoder: false
  xformers: false
  use_8bit_adam: false
  xl: true

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
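Apart from the project name, the differences from the SD 1.5 config are the base model, the 1024 px resolution, and the xl: true flag. How the parser resolves the ${HF_USERNAME} / ${HF_TOKEN} placeholders is not shown in this diff; below is a minimal sketch of one way such placeholders can be expanded from the environment (illustrative only, not AutoTrain's actual implementation):

```python
import os

import yaml  # pip install pyyaml


def load_config(path: str) -> dict:
    """Read a YAML config and expand ${VAR} placeholders from the environment."""
    with open(path) as f:
        raw = f.read()
    # os.path.expandvars swaps ${HF_USERNAME} / ${HF_TOKEN} for the matching
    # environment variables; unset variables are left as-is.
    return yaml.safe_load(os.path.expandvars(raw))


cfg = load_config("configs/dreambooth/sdxl_colab.yml")
print(cfg["params"]["resolution"])  # 1024 for the SDXL variant
print(cfg["hub"]["username"])       # resolved from HF_USERNAME if it is set
```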
27 changes: 27 additions & 0 deletions configs/image_classification/hub_dataset.yml
@@ -0,0 +1,27 @@
task: image_classification
base_model: google/vit-base-patch16-224
project_name: autotrain-cats-vs-dogs-finetuned
log: tensorboard
backend: local-cli

data:
  path: cats_vs_dogs
  train_split: train
  valid_split: null
  column_mapping:
    image_column: image
    target_column: labels

params:
  epochs: 2
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
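The column_mapping block ties the config to the cats_vs_dogs dataset on the Hub. A purely illustrative sanity check with the datasets library (assuming the dataset still exposes image and labels columns):

```python
from datasets import load_dataset

# Illustrative check, not part of this commit: confirm the columns named in
# column_mapping ("image" and "labels") exist in the cats_vs_dogs train split.
ds = load_dataset("cats_vs_dogs", split="train")
print(ds.column_names)        # expected to include 'image' and 'labels'
print(ds.features["labels"])  # class-label definition (cat / dog)
print(ds[0]["labels"])        # label id of the first example
```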
2 changes: 1 addition & 1 deletion configs/llm_finetuning/gpt2_sft.yml
@@ -1,4 +1,4 @@
-task: lm_training
+task: llm
base_model: openai-community/gpt2
project_name: autotrain-gpt2-finetuned-guanaco
log: tensorboard
2 changes: 1 addition & 1 deletion configs/llm_finetuning/llama3-8b-orpo.yml
@@ -1,4 +1,4 @@
-task: lm_training
+task: llm
base_model: meta-llama/Meta-Llama-3-8B-Instruct
project_name: llama3-8b-orpo
log: tensorboard
@@ -1,4 +1,4 @@
-task: text_multi_class_classification
+task: text_classification
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-imdb-finetuned
log: tensorboard
28 changes: 28 additions & 0 deletions configs/text_classification/local_dataset.yml
@@ -0,0 +1,28 @@
task: text_classification
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-imdb-finetuned
log: tensorboard
backend: local-cli

data:
  path: data/ # this must be the path to the directory containing the train and valid files
  train_split: train # this must be either train.csv or train.json
  valid_split: valid # this must be either valid.csv or valid.json
  column_mapping:
    text_column: text # this must be the name of the column containing the text
    target_column: label # this must be the name of the column containing the target

params:
  max_seq_length: 512
  epochs: 3
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
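For reference, the local layout the comments describe is a data/ directory containing train.csv and valid.csv with the columns named in column_mapping. A minimal stand-in created with pandas; the rows are hypothetical:

```python
import os

import pandas as pd

# Hypothetical stand-in for the layout described above: data/train.csv and
# data/valid.csv, each with the "text" and "label" columns from column_mapping.
os.makedirs("data", exist_ok=True)

train = pd.DataFrame(
    {
        "text": ["a genuinely moving film", "two hours I will never get back"],
        "label": ["positive", "negative"],
    }
)
valid = pd.DataFrame(
    {
        "text": ["predictable but charming"],
        "label": ["positive"],
    }
)
train.to_csv("data/train.csv", index=False)
valid.to_csv("data/valid.csv", index=False)
```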
28 changes: 28 additions & 0 deletions configs/text_regression/hub_dataset.yml
@@ -0,0 +1,28 @@
task: text_regression
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-sms-spam-finetuned
log: tensorboard
backend: local-cli

data:
  path: sms_spam
  train_split: train
  valid_split: null
  column_mapping:
    text_column: sms
    target_column: label

params:
  max_seq_length: 512
  epochs: 3
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
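Text regression expects a numeric target, and sms_spam's label column is a 0/1 class id, which serves as the float target in this example. An illustrative inspection of the mapped columns (assuming the dataset loads with its default configuration):

```python
from datasets import load_dataset

# Illustrative check, not part of this commit: verify the mapped columns and
# confirm the regression target is numeric.
ds = load_dataset("sms_spam", split="train")
print(ds.column_names)           # expected: ['sms', 'label']
print(sorted(set(ds["label"])))  # expected: [0, 1]
```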
28 changes: 28 additions & 0 deletions configs/text_regression/local_dataset.yml
@@ -0,0 +1,28 @@
task: text_regression
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-custom-finetuned
log: tensorboard
backend: local-cli

data:
  path: data/ # this must be the path to the directory containing the train and valid files
  train_split: train # this must be either train.csv or train.json
  valid_split: valid # this must be either valid.csv or valid.json
  column_mapping:
    text_column: text # this must be the name of the column containing the text
    target_column: label # this must be the name of the column containing the target

params:
  max_seq_length: 512
  epochs: 3
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
28 changes: 28 additions & 0 deletions configs/token_classification/hub_dataset.yml
@@ -0,0 +1,28 @@
task: token_classification
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-conll2003-finetuned
log: tensorboard
backend: local-cli

data:
  path: conll2003
  train_split: train
  valid_split: validation
  column_mapping:
    tokens_column: tokens
    tags_column: ner_tags

params:
  max_seq_length: 512
  epochs: 3
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
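Here the mapping points at pre-tokenized columns: tokens holds word lists and ner_tags holds integer tag ids. An illustrative peek at one conll2003 row, including decoding the ids back to tag names (assuming the dataset loads with default settings):

```python
from datasets import load_dataset

# Illustrative peek, not part of this commit, at the columns mapped above.
ds = load_dataset("conll2003", split="train")
example = ds[0]
print(example["tokens"])    # list of tokens for the first sentence
print(example["ner_tags"])  # matching list of integer NER tag ids

# Decode tag ids to names (O, B-PER, I-ORG, ...).
tag_names = ds.features["ner_tags"].feature.names
print([tag_names[i] for i in example["ner_tags"]])
```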
28 changes: 28 additions & 0 deletions configs/token_classification/local_dataset.yml
@@ -0,0 +1,28 @@
task: token_classification
base_model: google-bert/bert-base-uncased
project_name: autotrain-bert-custom-finetuned
log: tensorboard
backend: local-cli

data:
  path: data/ # this must be the path to the directory containing the train and valid files
  train_split: train # this must be train.json
  valid_split: valid # this must be valid.json; it can also be set to null
  column_mapping:
    text_column: text # this must be the name of the column containing the text
    target_column: label # this must be the name of the column containing the target

params:
  max_seq_length: 512
  epochs: 3
  batch_size: 4
  lr: 2e-5
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 1
  mixed_precision: fp16

hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true
2 changes: 1 addition & 1 deletion src/autotrain/__init__.py
@@ -41,4 +41,4 @@


logger = Logger().get_logger()
-__version__ = "0.7.77.dev0"
+__version__ = "0.7.78.dev0"
4 changes: 2 additions & 2 deletions src/autotrain/cli/autotrain.py
@@ -14,7 +14,7 @@
from autotrain.cli.run_text_regression import RunAutoTrainTextRegressionCommand
from autotrain.cli.run_token_classification import RunAutoTrainTokenClassificationCommand
from autotrain.cli.run_tools import RunAutoTrainToolsCommand
-from autotrain.configparser import ConfigParser
+from autotrain.parser import AutoTrainConfigParser


def main():
@@ -50,7 +50,7 @@ def main():

    if args.config:
        logger.info(f"Using AutoTrain configuration: {args.config}")
-        cp = ConfigParser(args.config)
+        cp = AutoTrainConfigParser(args.config)
        cp.run()
        exit(0)

10 changes: 5 additions & 5 deletions src/autotrain/cli/utils.py
@@ -380,11 +380,11 @@ def token_clf_munge_data(params, local):

def img_clf_munge_data(params, local):
    train_data_path = f"{params.data_path}/{params.train_split}"
-    if params.valid_split is not None:
-        valid_data_path = f"{params.data_path}/{params.valid_split}"
-    else:
-        valid_data_path = None
-    if os.path.isdir(train_data_path) or os.path.isdir(valid_data_path):
+    # if params.valid_split is not None:
+    #     valid_data_path = f"{params.data_path}/{params.valid_split}"
+    # else:
+    #     valid_data_path = None
+    if os.path.isdir(train_data_path):
        raise Exception("Image classification is not yet supported for local datasets using the CLI. Please use UI.")
    return params
