Text regression (#605)
abhishekkrthakur authored Apr 29, 2024
1 parent 00028d5 commit 15711ae
Showing 20 changed files with 828 additions and 49 deletions.
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -17,6 +17,8 @@
- sections:
  - local: text_classification
    title: Text Classification
  - local: text_regression
    title: Text Regression
  - local: llm_finetuning
    title: LLM Finetuning
  - local: image_classification
147 changes: 147 additions & 0 deletions docs/source/text_regression.mdx
@@ -0,0 +1,147 @@
# Text Regression

Training a text regression model with AutoTrain is super-easy! Get your data ready in
the proper format and then, with just a few clicks, your state-of-the-art model will be
ready to be used in production.

## Data Format

Let's train a model for scoring a movie review on a scale of 1-5. The data should be
in the following CSV format:

```csv
text,target
"this movie is great",5
"this movie is bad",1
.
.
.
```

As you can see, the CSV file has two columns: one contains the text and the other
contains the target. The target can be any float or int.

If your CSV is huge, you can divide it into multiple CSV files and upload them separately.
Please make sure that the column names are the same in all CSV files.

One way to divide the CSV file using pandas is as follows:

```python
import pandas as pd

# Set the chunk size (number of rows per output file)
chunk_size = 1000

# Read the CSV file in chunks and save each chunk to a numbered file
for i, chunk in enumerate(pd.read_csv('example.csv', chunksize=chunk_size), start=1):
    chunk.to_csv(f'chunk_{i}.csv', index=False)
```

Instead of CSV, you can also use the JSONL format, which should look as follows:

```json
{"text": "this movie is great", "target": 5}
{"text": "this movie is bad", "target": 1}
.
.
.
```
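
If your data already lives in a pandas DataFrame, one convenient way to produce the JSONL
file is `to_json` with `lines=True`. A minimal sketch (the DataFrame contents and filename
are examples):

```python
import pandas as pd

# Example DataFrame with the two required columns
df = pd.DataFrame(
    {
        "text": ["this movie is great", "this movie is bad"],
        "target": [5, 1],
    }
)

# orient="records" with lines=True writes one JSON object per line (JSONL)
df.to_json("train.jsonl", orient="records", lines=True)
```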

## Columns

Your CSV dataset must have two columns: `text` and `target`. If your dataset uses
different column names, you can map them with the `--text-column` and `--target-column`
arguments described below.
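
Before uploading, you can sanity-check a CSV with pandas. This is a minimal sketch (the
filename `train.csv` is an example):

```python
import pandas as pd

df = pd.read_csv("train.csv")

# The dataset must expose the two required columns
assert {"text", "target"}.issubset(df.columns), "missing required columns"

# Targets must be numeric (float or int) for regression
assert pd.api.types.is_numeric_dtype(df["target"]), "target column must be numeric"
```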


## Params

```
❯ autotrain text-regression --help
usage: autotrain <command> [<args>] text-regression [-h] [--train] [--deploy] [--inference] [--username USERNAME]
[--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}]
[--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME
[--data-path DATA_PATH] [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT]
[--batch-size BATCH_SIZE] [--seed SEED] [--epochs EPOCHS]
[--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] [--lr LR]
[--log {none,wandb,tensorboard}] [--text-column TEXT_COLUMN] [--target-column TARGET_COLUMN]
[--max-seq-length MAX_SEQ_LENGTH] [--warmup-ratio WARMUP_RATIO] [--optimizer OPTIMIZER]
[--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] [--max-grad-norm MAX_GRAD_NORM]
[--logging-steps LOGGING_STEPS] [--evaluation-strategy {steps,epoch,no}]
[--save-total-limit SAVE_TOTAL_LIMIT]
[--auto-find-batch-size] [--mixed-precision {fp16,bf16,None}]
✨ Run AutoTrain Text Regression
options:
-h, --help show this help message and exit
--train Command to train the model
--deploy Command to deploy the model (limited availability)
--inference Command to run inference (limited availability)
--username USERNAME Hugging Face Hub Username
--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}
Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only.
--token TOKEN Your Hugging Face API token. Token must have write access to the model hub.
--push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub.
--model MODEL Base model to use for training
--project-name PROJECT_NAME
Output directory / repo id for trained model (must be unique on hub)
--data-path DATA_PATH
Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate
formats
--train-split TRAIN_SPLIT
Train dataset split to use
--valid-split VALID_SPLIT
Validation dataset split to use
--batch-size BATCH_SIZE
Training batch size to use
--seed SEED Random seed for reproducibility
--epochs EPOCHS Number of training epochs
--gradient_accumulation GRADIENT_ACCUMULATION
Gradient accumulation steps
--disable_gradient_checkpointing
Disable gradient checkpointing
--lr LR Learning rate
--log {none,wandb,tensorboard}
Use experiment tracking
--text-column TEXT_COLUMN
Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields.
Default is 'text'.
--target-column TARGET_COLUMN
Specify the column name that holds the target or label data for training. Helps in distinguishing different potential
outputs. Default is 'target'.
--max-seq-length MAX_SEQ_LENGTH
Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are
truncated. Affects both memory usage and computational requirements. Default is 128 tokens.
--warmup-ratio WARMUP_RATIO
Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help
in stabilizing the training process early on. Default ratio is 0.1.
--optimizer OPTIMIZER
Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model
performance. 'adamw_torch' is used by default.
--scheduler SCHEDULER
Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the
learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule.
--weight-decay WEIGHT_DECAY
Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large
weights. Default is 0.0, meaning no weight decay is applied.
--max-grad-norm MAX_GRAD_NORM
Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient
problem in deep neural networks. Default is 1.0.
--logging-steps LOGGING_STEPS
Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging
steps automatically. Default is -1.
--evaluation-strategy {steps,epoch,no}
Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of
each training epoch by default.
--save-total-limit SAVE_TOTAL_LIMIT
Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints.
Default is to save only the latest one.
--auto-find-batch-size
Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch
size that fits in memory.
--mixed-precision {fp16,bf16,None}
Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for
default precision. Default is None.
```
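
Putting it together, a minimal local training invocation might look like the sketch below.
The model name, project name, and data path are placeholders; `data/` is assumed to
contain a `train.csv` with the columns described above:

```bash
autotrain text-regression \
  --train \
  --model bert-base-uncased \
  --project-name my-review-scorer \
  --data-path data/ \
  --text-column text \
  --target-column target \
  --epochs 3 \
  --batch-size 8 \
  --lr 5e-5
```

Add `--push-to-hub` together with `--username` and `--token` if you want the trained model
uploaded to the Hugging Face Hub.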
21 changes: 21 additions & 0 deletions src/autotrain/app.py
@@ -25,6 +25,7 @@
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams


@@ -135,6 +136,10 @@
    mixed_precision="fp16",
    log="tensorboard",
).model_dump()
PARAMS["text-regression"] = TextRegressionParams(
    mixed_precision="fp16",
    log="tensorboard",
).model_dump()


MODEL_CHOICE = fetch_models()
@@ -281,6 +286,18 @@ async def fetch_params(task: str, param_type: str):
"evaluation_strategy",
]
task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
if task == "text-regression" and param_type == "basic":
more_hidden_params = [
"warmup_ratio",
"weight_decay",
"max_grad_norm",
"seed",
"logging_steps",
"auto_find_batch_size",
"save_total_limit",
"evaluation_strategy",
]
task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
if task == "image-classification" and param_type == "basic":
more_hidden_params = [
"warmup_ratio",
@@ -394,6 +411,8 @@ async def fetch_model_choices(task: str, custom_models: str = Query(None)):
        hub_models = MODEL_CHOICE["tabular-regression"]
    elif task == "token-classification":
        hub_models = MODEL_CHOICE["token-classification"]
    elif task == "text-regression":
        hub_models = MODEL_CHOICE["text-regression"]
    else:
        raise NotImplementedError

@@ -514,6 +533,8 @@ async def handle_form(
dset_task = "lm_training"
elif task == "text-classification":
dset_task = "text_multi_class_classification"
elif task == "text-regression":
dset_task = "text_single_column_regression"
elif task == "seq2seq":
dset_task = "seq2seq"
elif task.startswith("tabular"):
18 changes: 18 additions & 0 deletions src/autotrain/app_params.py
@@ -8,6 +8,7 @@
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams


@@ -44,6 +45,8 @@ def munge(self):
            return self._munge_params_llm()
        elif self.task == "token-classification":
            return self._munge_params_token_clf()
        elif self.task == "text-regression":
            return self._munge_params_text_reg()
        else:
            raise ValueError(f"Unknown task: {self.task}")

@@ -91,6 +94,21 @@ def _munge_params_text_clf(self):
_params["valid_split"] = self.valid_split
return TextClassificationParams(**_params)

def _munge_params_text_reg(self):
_params = self._munge_common_params()
_params["model"] = self.base_model
_params["log"] = "tensorboard"
if not self.using_hub_dataset:
_params["text_column"] = "autotrain_text"
_params["target_column"] = "autotrain_label"
_params["valid_split"] = "validation"
else:
_params["text_column"] = self.column_mapping.get("text", "text")
_params["target_column"] = self.column_mapping.get("label", "label")
_params["train_split"] = self.train_split
_params["valid_split"] = self.valid_split
return TextRegressionParams(**_params)

def _munge_params_token_clf(self):
_params = self._munge_common_params()
_params["model"] = self.base_model
3 changes: 3 additions & 0 deletions src/autotrain/app_utils.py
@@ -15,6 +15,7 @@
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams


@@ -124,6 +125,8 @@ def run_training(params, task_id, local=False, wait=False):
        params = ImageClassificationParams(**params)
    elif task_id == 4:
        params = TokenClassificationParams(**params)
    elif task_id == 10:
        params = TextRegressionParams(**params)
    else:
        raise NotImplementedError

8 changes: 8 additions & 0 deletions src/autotrain/backend.py
@@ -21,6 +21,7 @@
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams


@@ -45,6 +46,7 @@ class SpaceRunner:
        DreamBoothTrainingParams,
        Seq2SeqParams,
        TokenClassificationParams,
        TextRegressionParams,
    ]
    backend: str

@@ -96,6 +98,8 @@ def __post_init__(self):
            self.task_id = 18
        elif isinstance(self.params, TokenClassificationParams):
            self.task_id = 4
        elif isinstance(self.params, TextRegressionParams):
            self.task_id = 10
        else:
            raise NotImplementedError

@@ -132,6 +136,10 @@ def prepare(self):
            self.task_id = 4
            space_id = self._create_space()
            return space_id
        if isinstance(self.params, TextRegressionParams):
            self.task_id = 10
            space_id = self._create_space()
            return space_id
        raise NotImplementedError

def _create_readme(self):
Expand Down
2 changes: 2 additions & 0 deletions src/autotrain/cli/autotrain.py
@@ -11,6 +11,7 @@
from .run_spacerunner import RunAutoTrainSpaceRunnerCommand
from .run_tabular import RunAutoTrainTabularCommand
from .run_text_classification import RunAutoTrainTextClassificationCommand
from .run_text_regression import RunAutoTrainTextRegressionCommand
from .run_token_classification import RunAutoTrainTokenClassificationCommand
from .run_tools import RunAutoTrainToolsCommand

@@ -37,6 +38,7 @@ def main():
    RunAutoTrainSeq2SeqCommand.register_subcommand(commands_parser)
    RunAutoTrainTokenClassificationCommand.register_subcommand(commands_parser)
    RunAutoTrainToolsCommand.register_subcommand(commands_parser)
    RunAutoTrainTextRegressionCommand.register_subcommand(commands_parser)

    args = parser.parse_args()
