From 8e6c2ac3fba32d80918831876f6f670b116d547a Mon Sep 17 00:00:00 2001 From: abhishekkrthakur Date: Fri, 4 Oct 2024 14:38:53 +0200 Subject: [PATCH] text clf/reg --- docs/source/_toctree.yml | 4 +- docs/source/params/llm_finetuning_params.bck | 95 ----------- .../params/text_classification_params.bck | 3 - docs/source/tasks/llm_finetuning.mdx | 28 ++-- ...mdx => text_classification_regression.mdx} | 53 +++++-- docs/source/tasks/text_regression.mdx | 147 ------------------ 6 files changed, 55 insertions(+), 275 deletions(-) delete mode 100644 docs/source/params/llm_finetuning_params.bck delete mode 100644 docs/source/params/text_classification_params.bck rename docs/source/tasks/{text_classification.mdx => text_classification_regression.mdx} (69%) delete mode 100644 docs/source/tasks/text_regression.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8020fe9171..456eaf8930 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -27,8 +27,8 @@ - sections: - local: tasks/llm_finetuning title: LLM Finetuning - - local: tasks/text_classification - title: Text Classification + - local: tasks/text_classification_regression + title: Text Classification/Regression - local: tasks/extractive_qa title: Extractive QA - local: tasks/sentence_transformer diff --git a/docs/source/params/llm_finetuning_params.bck b/docs/source/params/llm_finetuning_params.bck deleted file mode 100644 index db67624095..0000000000 --- a/docs/source/params/llm_finetuning_params.bck +++ /dev/null @@ -1,95 +0,0 @@ -# LLM Fine Tuning Parameters - -[[autodoc]] trainers.clm.params.LLMTrainingParams - -## Task specific parameters - - -The length parameters used for different trainers can be different. Some require more context than others. - -- block_size: This is the maximum sequence length or length of one block of text. Setting to -1 determines block size automatically. Default is -1. -- model_max_length: Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. Default is 1024 -- max_prompt_length: Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. Used only for `orpo` and `dpo` trainer. -- max_completion_length: Completion length to use, for orpo: encoder-decoder models only. For dpo, it is the length of the completion text. - -**NOTE**: - - block size cannot be greater than model_max_length! - - max_prompt_length cannot be greater than model_max_length! - - max_prompt_length cannot be greater than block_size! - - max_completion_length cannot be greater than model_max_length! - - max_completion_length cannot be greater than block_size! - -**NOTE**: Not following these constraints will result in an error / nan losses. - -### Generic Trainer - -``` ---add_eos_token, --add-eos-token - Toggle whether to automatically add an End Of Sentence (EOS) token at the end of texts, which can be critical for certain - types of models like language models. Only used for `default` trainer ---block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. ---model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. 
- Default is 1024 -``` - -### SFT Trainer - -``` ---block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. ---model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. - Default is 1024 -``` - -### Reward Trainer - -``` ---block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. ---model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. - Default is 1024 -``` - -### DPO Trainer - -``` ---dpo-beta DPO_BETA, --dpo-beta DPO_BETA - Beta for DPO trainer - ---model-ref MODEL_REF - Reference model to use for DPO when not using PEFT ---block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. ---model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. - Default is 1024 ---max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH - Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. - Used only for `orpo` trainer. ---max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH - Completion length to use, for orpo: encoder-decoder models only -``` - -### ORPO Trainer - -``` ---block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. ---model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. - Default is 1024 ---max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH - Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. - Used only for `orpo` trainer. 
---max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH - Completion length to use, for orpo: encoder-decoder models only -``` diff --git a/docs/source/params/text_classification_params.bck b/docs/source/params/text_classification_params.bck deleted file mode 100644 index 5c07792369..0000000000 --- a/docs/source/params/text_classification_params.bck +++ /dev/null @@ -1,3 +0,0 @@ -# Text Classification & Regression Parameters - -[[autodoc]] trainers.text_classification.params.TextClassificationParams diff --git a/docs/source/tasks/llm_finetuning.mdx b/docs/source/tasks/llm_finetuning.mdx index ba45230e5a..964fc2f7b3 100644 --- a/docs/source/tasks/llm_finetuning.mdx +++ b/docs/source/tasks/llm_finetuning.mdx @@ -12,12 +12,12 @@ Config file task names: - `llm-dpo`: DPO trainer - `llm-orpo`: ORPO trainer -# Data Preparation +## Data Preparation LLM finetuning accepts data in CSV and JSONL formats. JSONL is the preferred format. How data is formatted depends on the task you are training the LLM for. -## Classic Text Generation +### Classic Text Generation For text generation, the data should be in the following format: @@ -38,7 +38,7 @@ Compatible trainers: - SFT Trainer - Generic Trainer -## Chatbot / question-answering / code generation / function calling +### Chatbot / question-answering / code generation / function calling For this task, you can use CSV or JSONL data. If you are formatting the data yourself (adding start, end tokens, etc.), you can use CSV or JSONL format. If you do not want to format the data yourself and want `--chat-template` parameter to format the data for you, you must use JSONL format. @@ -146,9 +146,9 @@ Chat models can be trained using the following trainers: The only difference between the data format for reward trainer and DPO/ORPO trainer is that the reward trainer requires only `text` and `rejected_text` columns, while the DPO/ORPO trainer requires an additional `prompt` column. -# Training +## Training -## Local Training +### Local Training Locally the training can be performed by using `autotrain --config config.yaml` command. The `config.yaml` file should contain the following parameters: @@ -222,7 +222,7 @@ $ autotrain --config config.yaml More example config files for finetuning different types of lllm and different tasks can be found in the [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/llm_finetuning). -## Training in Hugging Face Spaces +### Training in Hugging Face Spaces If you are training in Hugging Face Spaces, everything is the same as local training: @@ -232,13 +232,13 @@ In the UI, you need to make sure you select the right model, the dataset and the Once you are happy with the parameters, you can click on the `Start Training` button to start the training process. -# Parameters +## Parameters -## LLM Fine Tuning Parameters +### LLM Fine Tuning Parameters [[autodoc]] trainers.clm.params.LLMTrainingParams -## Task specific parameters +### Task specific parameters The length parameters used for different trainers can be different. Some require more context than others. @@ -257,7 +257,7 @@ The length parameters used for different trainers can be different. Some require **NOTE**: Not following these constraints will result in an error / nan losses. -### Generic Trainer +#### Generic Trainer ``` --add_eos_token, --add-eos-token @@ -271,7 +271,7 @@ The length parameters used for different trainers can be different. 
Some require Default is 1024 ``` -### SFT Trainer +#### SFT Trainer ``` --block_size BLOCK_SIZE, --block-size BLOCK_SIZE @@ -282,7 +282,7 @@ The length parameters used for different trainers can be different. Some require Default is 1024 ``` -### Reward Trainer +#### Reward Trainer ``` --block_size BLOCK_SIZE, --block-size BLOCK_SIZE @@ -293,7 +293,7 @@ The length parameters used for different trainers can be different. Some require Default is 1024 ``` -### DPO Trainer +#### DPO Trainer ``` --dpo-beta DPO_BETA, --dpo-beta DPO_BETA @@ -314,7 +314,7 @@ The length parameters used for different trainers can be different. Some require Completion length to use, for orpo: encoder-decoder models only ``` -### ORPO Trainer +#### ORPO Trainer ``` --block_size BLOCK_SIZE, --block-size BLOCK_SIZE diff --git a/docs/source/tasks/text_classification.mdx b/docs/source/tasks/text_classification_regression.mdx similarity index 69% rename from docs/source/tasks/text_classification.mdx rename to docs/source/tasks/text_classification_regression.mdx index 098c63770a..b028278008 100644 --- a/docs/source/tasks/text_classification.mdx +++ b/docs/source/tasks/text_classification_regression.mdx @@ -1,18 +1,21 @@ -# Text Classification +# Text Classification & Regression -Training a text classification model with AutoTrain is super-easy! Get your data ready in +Training a text classification/regression model with AutoTrain is super-easy! Get your data ready in proper format and then with just a few clicks, your state-of-the-art model will be ready to be used in production. Config file task names: -- `text_classification`` +- `text_classification` - `text-classification` +- `text_regression` +- `text-regression` -# Data Format +## Data Format -Text classification supports datasets in both CSV and JSONL formats. +Text classification/regression supports datasets in both CSV and JSONL formats. + +### CSV Format -## CSV Format Let's train a model for classifying the sentiment of a movie review. The data should be in the following CSV format: @@ -29,8 +32,18 @@ As you can see, we have two columns in the CSV file. One column is the text and is the label. The label can be any string. In this example, we have two labels: `positive` and `negative`. You can have as many labels as you want. +And if you would like to train a model for scoring a movie review on a scale of 1-5. The data can be as follows: + +```csv +text,target +"this movie is great",4.9 +"this movie is bad",1.5 +. +. +. +``` -## JSONL Format +### JSONL Format Instead of CSV you can also use JSONL format. The JSONL format should be as follows: ```json @@ -41,21 +54,27 @@ Instead of CSV you can also use JSONL format. The JSONL format should be as foll . ``` -## Columns +and for regression: + +```json +{"text": "this movie is great", "target": 4.9} +{"text": "this movie is bad", "target": 1.5} + +### Column Mapping / Names Your CSV dataset must have two columns: `text` and `target`. If your column names are different than `text` and `target`, you can map the dataset column to AutoTrain column names. -# Training +## Training -## Local Training +### Local Training -To train a text classification model locally, you can use the `autotrain --config config.yaml` command. +To train a text classification/regression model locally, you can use the `autotrain --config config.yaml` command. 
Here is an example of a `config.yaml` file for training a text classification model: ```yaml -task: text_classification +task: text_classification # or text_regression base_model: google-bert/bert-base-uncased project_name: autotrain-bert-imdb-finetuned log: tensorboard @@ -109,7 +128,9 @@ To train the model, run the following command: $ autotrain --config config.yaml ``` -## Training on Hugging Face Spaces +You can find example config files for text classification and regression in the [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_classification) and [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_regression) respectively. + +### Training on Hugging Face Spaces The parameters for training on Hugging Face Spaces are the same as for local training. If you are using your own dataset, select "Local" as dataset source and upload your dataset. @@ -117,6 +138,10 @@ In the following screenshot, we are training a text classification model using t ![AutoTrain Text Classification on Hugging Face Spaces](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_text_classification.png) -# Parameters +For text regression, all you need to do is select "Text Regression" as the task and everything else remains the same (except the data, of course). + +## Training Parameters + +Training parameters for text classification and regression are the same. [[autodoc]] trainers.text_classification.params.TextClassificationParams diff --git a/docs/source/tasks/text_regression.mdx b/docs/source/tasks/text_regression.mdx deleted file mode 100644 index 04299b5f42..0000000000 --- a/docs/source/tasks/text_regression.mdx +++ /dev/null @@ -1,147 +0,0 @@ -# Text Regression - -Training a text regression model with AutoTrain is super-easy! Get your data ready in -proper format and then with just a few clicks, your state-of-the-art model will be ready to -be used in production. - -## Data Format - -Let's train a model for scoring a movie review on a scale of 1-5. The data should be -in the following CSV format: - -```csv -text,target -"this movie is great",5 -"this movie is bad",1 -. -. -. -``` - -As you can see, we have two columns in the CSV file. One column is the text and the other -is the label. The label can be any float or int. - -If your CSV is huge, you can divide it into multiple CSV files and upload them separately. -Please make sure that the column names are the same in all CSV files. - -One way to divide the CSV file using pandas is as follows: - -```python -import pandas as pd - -# Set the chunk size -chunk_size = 1000 -i = 1 - -# Open the CSV file and read it in chunks -for chunk in pd.read_csv('example.csv', chunksize=chunk_size): - # Save each chunk to a new file - chunk.to_csv(f'chunk_{i}.csv', index=False) - i += 1 -``` - -Instead of CSV you can also use JSONL format. The JSONL format should be as follows: - -```json -{"text": "this movie is great", "target": 5} -{"text": "this movie is bad", "target": 1} -. -. -. -``` - -## Columns - -Your CSV dataset must have two columns: `text` and `target`. 
- - -### Params - -``` -❯ autotrain text-regression --help -usage: autotrain [] text-regression [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME - [--data-path DATA_PATH] [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] - [--batch-size BATCH_SIZE] [--seed SEED] [--epochs EPOCHS] - [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] [--lr LR] - [--log {none,wandb,tensorboard}] [--text-column TEXT_COLUMN] [--target-column TARGET_COLUMN] - [--max-seq-length MAX_SEQ_LENGTH] [--warmup-ratio WARMUP_RATIO] [--optimizer OPTIMIZER] - [--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] [--max-grad-norm MAX_GRAD_NORM] - [--logging-steps LOGGING_STEPS] [--eval-strategy {steps,epoch,no}] - [--save-total-limit SAVE_TOTAL_LIMIT] - [--auto-find-batch-size] [--mixed-precision {fp16,bf16,None}] - -✨ Run AutoTrain Text Regression - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. - --target-column TARGET_COLUMN - Specify the column name that holds the target or label data for training. Helps in distinguishing different potential - outputs. Default is 'target'. - --max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. - --warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 
'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. - --max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. - --logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. - --eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. - --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. - --auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. - --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file
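
---

As a companion to the data-format sections above, here is a minimal, illustrative sketch of one way to turn a `text`/`target` CSV into the JSONL layout the docs describe. The file names `example.csv`, `train.jsonl`, and `valid.jsonl` are placeholders, and the 90/10 split is arbitrary; the same conversion works for both classification (string labels) and regression (float targets), since the JSONL layout is identical.

```python
import pandas as pd

# Read a raw CSV with the two required columns: `text` and `target`.
# `example.csv` is a placeholder filename; point this at your own data.
df = pd.read_csv("example.csv")[["text", "target"]].dropna()

# For classification, `target` holds label strings (e.g. "positive"/"negative");
# for regression it holds numbers (e.g. a 1-5 score). No other change is needed.

# Hold out a small validation split (optional; a single train split also works).
valid = df.sample(frac=0.1, random_state=42)
train = df.drop(valid.index)

# Write one JSON object per line, matching the JSONL examples in the docs.
train.to_json("train.jsonl", orient="records", lines=True)
valid.to_json("valid.jsonl", orient="records", lines=True)
```

The resulting folder can then be referenced from `data_path` (with `train_split` / `valid_split` set to the corresponding file names) in a `config.yaml` like the one shown above, or passed via `--data-path` on the CLI.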