diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 726647003c..d7f97bf1db 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,19 +1,27 @@ - sections: - local: index title: 🤗 AutoTrain - - local: getting_started - title: Installation - local: cost title: How much does it cost? - local: support title: Get help and support + - local: faq + title: Frequently Asked Questions title: Getting Started - sections: - - local: starting_ui - title: Starting the UI - - local: starting_cli - title: Starting the CLI - title: Starting AutoTrain + - local: quickstart_spaces + title: Quickstart + title: AutoTrain on Hugging Face Spaces +- sections: + - local: quickstart + title: Quickstart + - local: config + title: Configurations + title: Use AutoTrain Locally +- sections: + - local: col_map + title: Understanding Column Mapping + title: Miscellaneous - sections: - local: text_classification title: Text Classification @@ -31,4 +39,20 @@ title: Token Classification - local: tabular title: Tabular - title: Tasks \ No newline at end of file + title: Data Formats +- sections: + - local: text_classification_params + title: Text Classification & Regression + - local: llm_finetuning_params + title: LLM Finetuning + - local: image_classification_params + title: Image Classification + - local: dreambooth_params + title: DreamBooth + - local: seq2seq_params + title: Seq2Seq + - local: token_classification_params + title: Token Classification + - local: tabular_params + title: Tabular + title: Parameters \ No newline at end of file diff --git a/docs/source/col_map.mdx b/docs/source/col_map.mdx new file mode 100644 index 0000000000..d681ae2c57 --- /dev/null +++ b/docs/source/col_map.mdx @@ -0,0 +1,162 @@ +# Understanding Column Mapping + +Column mapping is a critical setup process in AutoTrain that informs the system +about the roles of different columns in your dataset. Whether it's a tabular +dataset, text classification data, or another type, the need for precise +column mapping ensures that AutoTrain processes each dataset element correctly. + +## How Column Mapping Works + +AutoTrain has no way of knowing what the columns in your dataset represent. +AutoTrain requires a clear understanding of each column's function within +your dataset to train models effectively. This is managed through a +straightforward mapping system in the user interface, represented as a dictionary. +Here's a typical example: + +``` +{"text": "text", "label": "target"} +``` + +In this example, the `text column in your dataset corresponds to the text data +AutoTrain uses for processing, and the `target`` column is treated as the +label for training. + +But let's not get confused! AutoTrain has a way to understand what each column in your dataset represents. +If your data is already in AutoTrain format, you dont need to change column mappings. +If not, you can easily map the columns in your dataset to the correct AutoTrain format. + +In the UI, you will see column mapping as a dictionary: + +``` +{"text": "text", "label": "target"} +``` + +Here, the column `text` in your dataset is mapped to the AutoTrain column `text`, +and the column `target` in your dataset is mapped to the AutoTrain column `label`. 
+ +Let's say you are training a text classification model and your dataset has the following columns: + +``` +full_text, target_sentiment +"this movie is great", positive +"this movie is bad", negative +``` + +You can map these columns to the AutoTrain format as follows: + +``` +{"text": "full_text", "label": "target_sentiment"} +``` + +If your dataset has the columns: `text` and `label`, you don't need to change the column mapping. + +Let's take a look at column mappings for each task: + +## LLM + +Note: For all LLM tasks, if the text column(s) is not formatted i.e. if contains samples in chat format (dict or json), then you +should use `chat_template` parameter. Read more about it in LLM Parameters Section. + + +### SFT / Generic Trainer + +``` +{"text": "text"} +``` + +`text`: The column in your dataset that contains the text data. + + +### Reward / ORPO Trainer + +``` +{"text": "text", "rejected_text": "rejected_text"} +``` + +`text`: The column in your dataset that contains the text data. + +`rejected_text`: The column in your dataset that contains the rejected text data. + +### DPO Trainer + +``` +{"prompt": "prompt", "text": "text", "rejected_text": "rejected_text"} +``` + +`prompt`: The column in your dataset that contains the prompt data. + +`text`: The column in your dataset that contains the text data. + +`rejected_text`: The column in your dataset that contains the rejected text data. + + +## Text Classification & Regression, Seq2Seq + +For text classification and regression, the column mapping should be as follows: + +``` +{"text": "dataset_text_column", "label": "dataset_target_column"} +``` + +`text`: The column in your dataset that contains the text data. + +`label`: The column in your dataset that contains the target variable. + + +## Token Classification + + +``` +{"text": "tokens", "label": "tags"} +``` + +`text`: The column in your dataset that contains the tokens. These tokens must be a list of strings. + +`label`: The column in your dataset that contains the tags. These tags must be a list of strings. + +For token classification, if you are using a CSV, make sure that the columns are stringified lists. + +## Tabular Classification & Regression + +``` +{"id": "id", "label": ["target"]} +``` + +`id`: The column in your dataset that contains the unique identifier for each row. + +`label`: The column in your dataset that contains the target variable. This should be a list of strings. + +For a single target column, you can pass a list with a single element. + +For multiple target columns, e.g. a multi label classification task, you can pass a list with multiple elements. + + +# DreamBooth LoRA + +Dreambooth doesn't require column mapping. + +# Image Classification + +For image classification, the column mapping should be as follows: + +``` +{"image": "image_column", "label": "label_column"} +``` + +Image classification requires column mapping only when you are using a dataset from Hugging Face Hub. +For uploaded datasets, leave column mapping as it is. + +## Ensuring Accurate Mapping + +To ensure your model trains correctly: + +- Verify Column Names: Double-check that the names used in the mapping dictionary accurately reflect those in your dataset. + +- Format Appropriately: Especially in token classification, ensure your data format matches expectations (e.g., lists of strings). + +- Update Mappings for New Datasets: Each new dataset might require its unique mappings based on its structure and the task at hand. 
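+If you are training locally with a config file rather than through the UI, the same mapping lives under the `data` section of your YAML config, so it is worth double-checking there too. The snippet below is only a sketch: the dataset path is a placeholder, and the key names (`text_column`, `target_column`) are assumed to follow the pattern of the ORPO example shown on the Configurations page.
+
+```yaml
+data:
+  path: your-username/your-sentiment-dataset   # placeholder dataset
+  train_split: train
+  valid_split: null
+  column_mapping:
+    text_column: full_text            # dataset column that holds the text
+    target_column: target_sentiment   # dataset column that holds the label
+```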
+ +By following these guidelines and using the provided examples as templates, +you can effectively instruct AutoTrain on how to interpret and handle your +data for various machine learning tasks. This process is fundamental for +achieving optimal results from your model training endeavors. diff --git a/docs/source/config.mdx b/docs/source/config.mdx new file mode 100644 index 0000000000..e255229b6e --- /dev/null +++ b/docs/source/config.mdx @@ -0,0 +1,65 @@ +# AutoTrain Configs + +AutoTrain Configs are the way to use and train models using AutoTrain locally. + +Once you have installed AutoTrain Advanced, you can use the following command to train models using AutoTrain config files: + +```bash +$ export HF_USERNAME=your_hugging_face_username +$ export HF_TOKEN=your_hugging_face_write_token + +$ autotrain --config path/to/config.yaml +``` + +Example configurations for all tasks can be found in the `configs` directory of +the [AutoTrain Advanced GitHub repository](https://github.com/huggingface/autotrain-advanced). + +Here is an example of an AutoTrain config file: + +```yaml +task: llm +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-orpo +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + +params: + trainer: orpo + block_size: 1024 + model_max_length: 2048 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: bf16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +In this config, we are finetuning the `meta-llama/Meta-Llama-3-8B-Instruct` model +on the `argilla/distilabel-capybara-dpo-7k-binarized` dataset using the `orpo` +trainer for 3 epochs with a batch size of 2 and a learning rate of `3e-5`. +More information on the available parameters can be found in the *Data Formats and Parameters* section. + +In case you dont want to push the model to hub, you can set `push_to_hub` to `false` in the config file. +If not pushing the model to hub username and token are not required. Note: they may still be needed +if you are trying to access gated models or datasets. \ No newline at end of file diff --git a/docs/source/cost.mdx b/docs/source/cost.mdx index 698d9b45c5..4c9b4ef17f 100644 --- a/docs/source/cost.mdx +++ b/docs/source/cost.mdx @@ -1,11 +1,40 @@ # How much does it cost? -AutoTrain provides you with best models which are deployable with just a few clicks. -Unlike other services, we don't own your models. Once the training is done, you can download them and use them anywhere you want. +AutoTrain offers an accessible approach to model training, providing deployable models +with just a few clicks. Understanding the cost involved is essential to planning and +executing your projects efficiently. -You will be charged per minute based on the hardware you choose. -Pricing information is available in the [pricing](https://huggingface.co/pricing#spaces) section. +## Local Usage -Please note that in order to use AutoTrain, you need to have a valid payment method on file. -You can add your payment method in the [billing](https://huggingface.co/settings/billing) section. +When you choose to use AutoTrain locally on your own hardware, there is no cost. 
+This option is ideal for those who prefer to manage their own infrastructure and +do not require the scalability that cloud resources offer. + +## Using AutoTrain on Hugging Face Spaces + +**Pay-As-You-Go**: Costs for using AutoTrain in Hugging Face Spaces are based on the +computing resources you consume. This flexible pricing structure ensures you only pay +for what you use, making it cost-effective and scalable for projects of any size. + + +**Ownership and Portability**: Unlike some other platforms, AutoTrain does not retain +ownership of your models. Once training is complete, you are free to download and +deploy your models wherever you choose, providing flexibility and control over your all your assets. + +### Pricing Details + +**Resource-Based Billing**: Charges are accrued per minute according to the type of hardware +utilized during training. This means you can scale your resource usage based on the +complexity and needs of your projects. + +For a detailed breakdown of the costs associated with using Hugging Face Spaces, +please refer to the [pricing](https://huggingface.co/pricing#spaces) section on our website. + +To access the paid features of AutoTrain, you must have a valid payment method on file. +You can manage your payment options and view your billing information in +the [billing section of your Hugging Face account settings.](https://huggingface.co/settings/billing) + +By offering both free and flexible paid options, AutoTrain ensures that users can choose +the most suitable model training solution for their needs, whether they are experimenting +on a local machine or scaling up operations on Hugging Face Spaces. diff --git a/docs/source/dreambooth.mdx b/docs/source/dreambooth.mdx index 4ebe370b50..9c5321567e 100644 --- a/docs/source/dreambooth.mdx +++ b/docs/source/dreambooth.mdx @@ -1,151 +1,35 @@ # DreamBooth -DreamBooth is a method to personalize text-to-image models like Stable Diffusion given just a few (3-5) images of a subject. It allows the model to generate contextualized images of the subject in different scenes, poses, and views. +DreamBooth is an innovative method that allows for the customization of text-to-image +models like Stable Diffusion using just a few images of a subject. +DreamBooth enables the generation of new, contextually varied images of the +subject in a range of scenes, poses, and viewpoints, expanding the creative +possibilities of generative models. + ## Data Preparation The data format for DreamBooth training is simple. All you need is images of a concept (e.g. a person) and a concept token. -To train a dreambooth model, please select an appropriate model from the hub. -When choosing a model from the hub, please make sure you select the correct image size compatible with the model. +### Step 1: Gather Your Images + +Collect 3-5 high-quality images of the subject you wish to personalize. +These images should vary slightly in pose or background to provide the model with a +diverse learning set. You can select more images if you want to train a more robust model. + -Your concept token is `prompt` in parameters section. +### Step 2: Select Your Model -## Parameters +Choose a base model from the Hugging Face Hub that is compatible with your needs. +It's essential to select a model that supports the image size of your training data. +Models available on the hub often have specific requirements or capabilities, +so ensure the model you choose can accommodate the dimensions of your images. 
-``` -❯ autotrain dreambooth --help -usage: autotrain [] dreambooth [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME [--data-path DATA_PATH] - [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] [--batch-size BATCH_SIZE] [--seed SEED] - [--epochs EPOCHS] [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] - [--lr LR] [--log {none,wandb,tensorboard}] [--revision REVISION] [--tokenizer TOKENIZER] --image-path - IMAGE_PATH [--class-image-path CLASS_IMAGE_PATH] --prompt PROMPT [--class-prompt CLASS_PROMPT] - [--num-class-images NUM_CLASS_IMAGES] [--class-labels-conditioning CLASS_LABELS_CONDITIONING] - [--prior-preservation] [--prior-loss-weight PRIOR_LOSS_WEIGHT] --resolution RESOLUTION - [--center-crop] [--train-text-encoder] [--sample-batch-size SAMPLE_BATCH_SIZE] - [--num-steps NUM_STEPS] [--checkpointing-steps CHECKPOINTING_STEPS] - [--resume-from-checkpoint RESUME_FROM_CHECKPOINT] [--scale-lr] [--scheduler SCHEDULER] - [--warmup-steps WARMUP_STEPS] [--num-cycles NUM_CYCLES] [--lr-power LR_POWER] - [--dataloader-num-workers DATALOADER_NUM_WORKERS] [--use-8bit-adam] [--adam-beta1 ADAM_BETA1] - [--adam-beta2 ADAM_BETA2] [--adam-weight-decay ADAM_WEIGHT_DECAY] [--adam-epsilon ADAM_EPSILON] - [--max-grad-norm MAX_GRAD_NORM] [--allow-tf32] - [--prior-generation-precision PRIOR_GENERATION_PRECISION] [--local-rank LOCAL_RANK] [--xformers] - [--pre-compute-text-embeddings] [--tokenizer-max-length TOKENIZER_MAX_LENGTH] - [--text-encoder-use-attention-mask] [--rank RANK] [--xl] [--mixed-precision MIXED_PRECISION] - [--validation-prompt VALIDATION_PROMPT] [--num-validation-images NUM_VALIDATION_IMAGES] - [--validation-epochs VALIDATION_EPOCHS] [--checkpoints-total-limit CHECKPOINTS_TOTAL_LIMIT] - [--validation-images VALIDATION_IMAGES] [--logging] -✨ Run AutoTrain DreamBooth Training +### Step 3: Define Your Concept Token -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. 
When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --revision REVISION Model revision to use for training - --tokenizer TOKENIZER - Tokenizer to use for training - --image-path IMAGE_PATH - Path to the images - --class-image-path CLASS_IMAGE_PATH - Path to the class images - --prompt PROMPT Instance prompt - --class-prompt CLASS_PROMPT - Class prompt - --num-class-images NUM_CLASS_IMAGES - Number of class images - --class-labels-conditioning CLASS_LABELS_CONDITIONING - Class labels conditioning - --prior-preservation With prior preservation - --prior-loss-weight PRIOR_LOSS_WEIGHT - Prior loss weight - --resolution RESOLUTION - Resolution - --center-crop Center crop - --train-text-encoder Train text encoder - --sample-batch-size SAMPLE_BATCH_SIZE - Sample batch size - --num-steps NUM_STEPS - Max train steps - --checkpointing-steps CHECKPOINTING_STEPS - Checkpointing steps - --resume-from-checkpoint RESUME_FROM_CHECKPOINT - Resume from checkpoint - --scale-lr Scale learning rate - --scheduler SCHEDULER - Learning rate scheduler - --warmup-steps WARMUP_STEPS - Learning rate warmup steps - --num-cycles NUM_CYCLES - Learning rate num cycles - --lr-power LR_POWER Learning rate power - --dataloader-num-workers DATALOADER_NUM_WORKERS - Dataloader num workers - --use-8bit-adam Use 8bit adam - --adam-beta1 ADAM_BETA1 - Adam beta 1 - --adam-beta2 ADAM_BETA2 - Adam beta 2 - --adam-weight-decay ADAM_WEIGHT_DECAY - Adam weight decay - --adam-epsilon ADAM_EPSILON - Adam epsilon - --max-grad-norm MAX_GRAD_NORM - Max grad norm - --allow-tf32 Allow TF32 - --prior-generation-precision PRIOR_GENERATION_PRECISION - Prior generation precision - --local-rank LOCAL_RANK - Local rank - --xformers Enable xformers memory efficient attention - --pre-compute-text-embeddings - Pre compute text embeddings - --tokenizer-max-length TOKENIZER_MAX_LENGTH - Tokenizer max length - --text-encoder-use-attention-mask - Text encoder use attention mask - --rank RANK Rank - --xl XL - --mixed-precision MIXED_PRECISION - mixed precision, fp16, bf16, none - --validation-prompt VALIDATION_PROMPT - Validation prompt - --num-validation-images NUM_VALIDATION_IMAGES - Number of validation images - --validation-epochs VALIDATION_EPOCHS - Validation epochs - --checkpoints-total-limit CHECKPOINTS_TOTAL_LIMIT - Checkpoints total limit - --validation-images VALIDATION_IMAGES - Validation images - --logging Logging using tensorboard -``` \ No newline at end of file +The concept token is a crucial element in DreamBooth training. +This token acts as a unique identifier for your subject within the model. +Typically, you will use a simple, descriptive keyword like prompt in the parameters +section of your training setup. This token will be used to generate new images of +your subject by the model. 
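+Putting the pieces together, a local training run might be launched with a command along these lines. This is only a sketch: the base model, folder name, prompt, and hyperparameter values are placeholders, and the full list of flags is described in the DreamBooth parameters section.
+
+```bash
+$ autotrain dreambooth --train \
+  --model stabilityai/stable-diffusion-xl-base-1.0 \
+  --project-name my-dreambooth-project \
+  --image-path ./subject-images/ \
+  --prompt "photo of sks person" \
+  --resolution 1024 \
+  --batch-size 1 \
+  --num-steps 500 \
+  --lr 1e-4
+```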
diff --git a/docs/source/dreambooth_params.mdx b/docs/source/dreambooth_params.mdx new file mode 100644 index 0000000000..7dc3fd9cd8 --- /dev/null +++ b/docs/source/dreambooth_params.mdx @@ -0,0 +1,81 @@ +## DreamBooth Parameters + +``` + --batch-size BATCH_SIZE + Training batch size to use + --seed SEED Random seed for reproducibility + --epochs EPOCHS Number of training epochs + --gradient_accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps + --disable_gradient_checkpointing + Disable gradient checkpointing + --lr LR Learning rate + --tokenizer TOKENIZER + Tokenizer to use for training + --class-image-path CLASS_IMAGE_PATH + Path to the class images + --prompt PROMPT Instance prompt + --prior-preservation With prior preservation + --prior-loss-weight PRIOR_LOSS_WEIGHT + Prior loss weight + --resolution RESOLUTION + Resolution + --center-crop Center crop + --train-text-encoder Train text encoder + --sample-batch-size SAMPLE_BATCH_SIZE + Sample batch size + --num-steps NUM_STEPS + Max train steps + --checkpointing-steps CHECKPOINTING_STEPS + Checkpointing steps + --resume-from-checkpoint RESUME_FROM_CHECKPOINT + Resume from checkpoint + --scale-lr Scale learning rate + --scheduler SCHEDULER + Learning rate scheduler + --warmup-steps WARMUP_STEPS + Learning rate warmup steps + --num-cycles NUM_CYCLES + Learning rate num cycles + --lr-power LR_POWER Learning rate power + --dataloader-num-workers DATALOADER_NUM_WORKERS + Dataloader num workers + --use-8bit-adam Use 8bit adam + --adam-beta1 ADAM_BETA1 + Adam beta 1 + --adam-beta2 ADAM_BETA2 + Adam beta 2 + --adam-weight-decay ADAM_WEIGHT_DECAY + Adam weight decay + --adam-epsilon ADAM_EPSILON + Adam epsilon + --max-grad-norm MAX_GRAD_NORM + Max grad norm + --allow-tf32 Allow TF32 + --prior-generation-precision PRIOR_GENERATION_PRECISION + Prior generation precision + --local-rank LOCAL_RANK + Local rank + --xformers Enable xformers memory efficient attention + --pre-compute-text-embeddings + Pre compute text embeddings + --tokenizer-max-length TOKENIZER_MAX_LENGTH + Tokenizer max length + --text-encoder-use-attention-mask + Text encoder use attention mask + --rank RANK Rank + --xl XL + --mixed-precision MIXED_PRECISION + mixed precision, fp16, bf16, none + --validation-prompt VALIDATION_PROMPT + Validation prompt + --num-validation-images NUM_VALIDATION_IMAGES + Number of validation images + --validation-epochs VALIDATION_EPOCHS + Validation epochs + --checkpoints-total-limit CHECKPOINTS_TOTAL_LIMIT + Checkpoints total limit + --validation-images VALIDATION_IMAGES + Validation images + --logging Logging using tensorboard +``` \ No newline at end of file diff --git a/docs/source/faq.mdx b/docs/source/faq.mdx new file mode 100644 index 0000000000..abca494ca4 --- /dev/null +++ b/docs/source/faq.mdx @@ -0,0 +1,64 @@ +# Frequently Asked Questions + +## Are my data and models secure? + +Yes, your data and models are secure. AutoTrain uses the Hugging Face Hub to store your data and models. +All your data and models are uploaded to your Hugging Face account as private repositories and are only accessible by you. +Read more about security [here](https://huggingface.co/docs/hub/en/security). + +## Do you upload my data to the Hugging Face Hub? + +AutoTrain will not upload your dataset to the Hub if you are using the local backend or training in the same space. +AutoTrain will push your dataset to the Hub if you are using features like: DGX Cloud +or using local CLI to train on Hugging Face's infrastructure. 
+ +You can safely remove the dataset from the Hub after training is complete. +If uploaded, the dataset will be stored in your Hugging Face account as a private repository and will only be accessible by you +and the training process. It is not used once the training is complete. + +## I get error `Your installed package nvidia-ml-py is corrupted. Skip patch functions` + +This error can be safely ignored. It is a warning from the `nvitop` library and does not affect the functionality of AutoTrain. + +## I get 409 conflict error when using the UI + +This error occurs when you try to create a project with the same name as an existing project. +To resolve this error, you can either delete the existing project or create a new project +with a different name. + +This error can also occur when you are trying to train a model while a model is already training in the same space or locally. + + +## The model I want to use doesn't show up in the model selection dropdown. + +If the model you want to use is not available in the model selection dropdown, +you can add it in the environment variable `AUTOTRAIN_CUSTOM_MODELS` in the space settings. +For example, if you want to add the `xxx/yyy` model, go to space settings, create a variable named `AUTOTRAIN_CUSTOM_MODELS` +and set the value to `xxx/yyy`. + +You can also pass the model name as query parameter in the URL. For example, if you want to use the `xxx/yyy` model, +you can use the URL `https://huggingface.co/spaces/your_autotrain_space?custom_models=xxx/yyy`. + +## How do I use AutoTrain locally? + +AutoTrain can be used locally by installing the AutoTrain Advanced pypi package. +You can read more in *Use AutoTrain Locally* section. + + +## Can I run AutoTrain on Colab? + +To start the UI on Colab, you can simply click on the following link: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain.ipynb) + +Please note, to run the app on Colab, you will need an ngrok token. You can get one by signing up for free on [ngrok](https://ngrok.com/). +This is because Colab does not allow exposing ports to the internet directly. + +To use the CLI instead on Colab, you can follow the same instructions as for using AutoTrain locally. + + +## Does AutoTrain have a docker image? + +Yes, AutoTrain has a docker image. +You can find the docker image on Docker Hub [here](https://hub.docker.com/r/huggingface/autotrain-advanced). + diff --git a/docs/source/getting_started.mdx b/docs/source/getting_started.bck similarity index 100% rename from docs/source/getting_started.mdx rename to docs/source/getting_started.bck diff --git a/docs/source/image_classification.mdx b/docs/source/image_classification.mdx index cb6dffb350..8bcf8a8034 100644 --- a/docs/source/image_classification.mdx +++ b/docs/source/image_classification.mdx @@ -1,11 +1,23 @@ # Image Classification -Image classification is a supervised learning problem: define a set of target classes (objects to identify in images), and train a model to recognize them using labeled example photos. -Using AutoTrain, its super-easy to train a state-of-the-art image classification model. Just upload a set of images, and AutoTrain will automatically train a model to classify them. +Image classification is a form of supervised learning where a model is trained to identify +and categorize objects within images. 
AutoTrain simplifies the process, enabling you to +train a state-of-the-art image classification model by simply uploading labeled example +images. -## Data Preparation -The data for image classification must be in zip format, with each class in a separate subfolder. For example, if you want to classify cats and dogs, your zip file should look like this: +## Preparing your data + +To ensure your image classification model trains effectively, follow these guidelines for preparing your data: + + +### Organizing Images + + +Prepare a zip file containing your categorized images. Each category should have its own +subfolder named after the class it represents. For example, to differentiate between +'cats' and 'dogs', your zip file structure should resemble the following: + ``` cats_and_dogs.zip @@ -21,100 +33,31 @@ cats_and_dogs.zip └── ... ``` +### Image Requirements + +- Format: Ensure all images are in JPEG, JPG, or PNG format. + +- Quantity: Include at least 5 images per class to provide the model with sufficient examples for learning. + +- Exclusivity: The zip file should exclusively contain folders named after the classes, +and these folders should only contain relevant images. No additional files or nested +folders should be included. + + +** Additional Tips** + +- Uniformity: While not required, having images of similar sizes and resolutions can help improve model performance. + +- Variability: Include a variety of images for each class to encompass the range of +appearances and contexts the model might encounter in real-world scenarios. + Some points to keep in mind: - The zip file should contain multiple folders (the classes), each folder should contain images of a single class. - The name of the folder should be the name of the class. - The images must be jpeg, jpg or png. - There should be at least 5 images per class. -- There should not be any other files in the zip file. -- There should not be any other folders inside the zip folder. +- There must not be any other files in the zip file. +- There must not be any other folders inside the zip folder. -When train.zip is decompressed, it creates two folders: cats and dogs. these are the two categories for classification. The images for both categories are in their respective folders. You can have as many categories as you want. 
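+If your images are already organized into per-class folders, one simple way to produce the archive is the standard `zip` utility; any tool that preserves the folder structure works just as well:
+
+```bash
+# run from the directory that contains the class folders
+$ zip -r cats_and_dogs.zip cats dogs
+```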
- - -## Parameters - -``` -❯ autotrain image-classification --help -usage: autotrain [] image-classification [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME - [--data-path DATA_PATH] [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] - [--batch-size BATCH_SIZE] [--seed SEED] [--epochs EPOCHS] - [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] - [--lr LR] [--log {none,wandb,tensorboard}] [--image-column IMAGE_COLUMN] - [--target-column TARGET_COLUMN] [--warmup-ratio WARMUP_RATIO] [--optimizer OPTIMIZER] - [--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] [--max-grad-norm MAX_GRAD_NORM] - [--logging-steps LOGGING_STEPS] [--evaluation-strategy {steps,epoch,no}] - [--save-total-limit SAVE_TOTAL_LIMIT] - [--auto-find-batch-size] [--mixed-precision {fp16,bf16,None}] - -✨ Run AutoTrain Image Classification - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --image-column IMAGE_COLUMN - Image column to use - --target-column TARGET_COLUMN - Target column to use - --warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. 
- --max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. - --logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. - --evaluation-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. - --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. - --auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. - --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file +When train.zip is decompressed, it creates two folders: cats and dogs. these are the two categories for classification. The images for both categories are in their respective folders. You can have as many categories as you want. \ No newline at end of file diff --git a/docs/source/image_classification_params.mdx b/docs/source/image_classification_params.mdx new file mode 100644 index 0000000000..e6d8f3328a --- /dev/null +++ b/docs/source/image_classification_params.mdx @@ -0,0 +1,49 @@ +# Image Classification Parameters + +``` +--batch-size BATCH_SIZE + Training batch size to use +--seed SEED Random seed for reproducibility +--epochs EPOCHS Number of training epochs +--gradient_accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps +--disable_gradient_checkpointing + Disable gradient checkpointing +--lr LR Learning rate +--log {none,wandb,tensorboard} + Use experiment tracking +--image-column IMAGE_COLUMN + Image column to use +--target-column TARGET_COLUMN + Target column to use +--warmup-ratio WARMUP_RATIO + Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help + in stabilizing the training process early on. Default ratio is 0.1. +--optimizer OPTIMIZER + Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model + performance. 'adamw_torch' is used by default. +--scheduler SCHEDULER + Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the + learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. +--weight-decay WEIGHT_DECAY + Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large + weights. Default is 0.0, meaning no weight decay is applied. +--max-grad-norm MAX_GRAD_NORM + Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient + problem in deep neural networks. Default is 1.0. +--logging-steps LOGGING_STEPS + Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging + steps automatically. Default is -1. 
+--evaluation-strategy {steps,epoch,no} + Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of + each training epoch by default. +--save-total-limit SAVE_TOTAL_LIMIT + Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. + Default is to save only the latest one. +--auto-find-batch-size + Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch + size that fits in memory. +--mixed-precision {fp16,bf16,None} + Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for + default precision. Default is None. +``` \ No newline at end of file diff --git a/docs/source/index.mdx b/docs/source/index.mdx index ca89078eea..cfe9d572ac 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -1,53 +1,48 @@ +# What is AutoTrain Advanced? + ![autotrain-homepage](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_homepage.png) +🤗 AutoTrain Advanced (or simply AutoTrain), developed by Hugging Face, is a robust no-code +platform designed to simplify the process of training state-of-the-art models across +multiple domains: Natural Language Processing (NLP), Computer Vision (CV), +and even Tabular Data analysis. This tool leverages the powerful frameworks created by +various teams at Hugging Face, making advanced machine learning and artificial intelligence accessible to a broader +audience without requiring deep technical expertise. +# Who should use AutoTrain? -🤗 AutoTrain Advanced (or simply AutoTrain) is a no-code tool for training state-of-the-art models for Natural Language Processing (NLP) tasks, for Computer Vision (CV) tasks, and for Speech tasks and even for Tabular tasks. It is built on top of the awesome tools developed by the Hugging Face team, and it is designed to be easy to use. +AutoTrain is the perfect tool for anyone eager to dive into the world of machine learning +without getting bogged down by the complexities of model training. +Whether you're a business professional, researcher, educator, or hobbyist, +AutoTrain offers the simplicity of a no-code interface while still providing the +capabilities necessary to develop sophisticated models tailored to your unique datasets. -## Who should use AutoTrain? +AutoTrain is for anyone who wants to train a state-of-the-art model for a NLP, CV, Speech or even Tabular task, +but doesn't want to spend time on the technical details of training a model. -AutoTrain is for anyone who wants to train a state-of-the-art model for a NLP, CV, Speech or even Tabular task, but doesn't want to spend time on the technical details of training a model. -AutoTrain is also for anyone who wants to train a model for a custom dataset, but doesn't want to spend time on the technical details of training a model. -Our goal is to make it easy for anyone to train a state-of-the-art model for any task and our focus is not just data scientists or machine learning engineers, but also non-technical users. +Our mission is to democratize machine learning technology, ensuring it is not only +accessible to data scientists and ML engineers but also to those without a technical +background. If you're looking to harness the power of AI for your projects, +AutoTrain is your answer. -## How to use AutoTrain? +# How to use AutoTrain? 
We offer several ways to use AutoTrain: - No code users can use `AutoTrain Advanced` by creating a new space with AutoTrain Docker image: [Click here](https://huggingface.co/login?next=/spaces/autotrain-projects/autotrain-advanced?duplicate=true) to create AutoTrain Space. -Please make sure you keep the space private and attach appropriate hardware to the space. - -- Developers can access and build on top of AutoTrain using python api or run AutoTrain Advanced UI locally. -The python api is available in the `autotrain-advanced` package. - - -You can install it using pip: - -```bash -$ pip install autotrain-advanced -``` - -# Running AutoTrain Locally - -To run the autotrain app locally, you can use the following command: +Remember to keep your space private and ensure it is equipped with the necessary hardware resources (GPU) for optimal performance. -```bash -$ export HF_TOKEN=your_hugging_face_write_token -$ autotrain app --host 127.0.0.1 --port 8000 -``` +- If you prefer a more hands-on approach, AutoTrain Advanced can also be run locally +through its intuitive UI or accessed via the Python API provided in the autotrain-advanced +package. This flexibility allows developers to integrate AutoTrain capabilities directly +into their projects, customize workflows, and enhance their toolsets with advanced machine +learning functionalities. -This will start the app on `http://127.0.0.1:8000`. -Its advised to install autotrain-advanced in a virtual environment to avoid any conflicts with other packages. +By bridging the gap between cutting-edge technology and practical usability, +AutoTrain Advanced empowers users to achieve remarkable results in AI without the need +for extensive programming knowledge. Start your journey with AutoTrain today and unlock +the potential of machine learning for your projects! -```bash -$ conda create -n autotrain python=3.10 -$ conda activate autotrain -$ pip install autotrain-advanced -$ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -$ conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc -$ export HF_TOKEN=your_hugging_face_write_token -$ autotrain app --host 127.0.0.1 --port 8000 -``` diff --git a/docs/source/llm_finetuning.mdx b/docs/source/llm_finetuning.mdx index 3597dba7dd..b5bfeddeef 100644 --- a/docs/source/llm_finetuning.mdx +++ b/docs/source/llm_finetuning.mdx @@ -58,141 +58,3 @@ For DPO Trainer, your dataset must have a `prompt` column, a `text` column (aka For all tasks, you can use both CSV and JSONL files! 
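+As an illustration, a single record in a DPO-style JSONL file could look like this (the wording is made up):
+
+```
+{"prompt": "What is the capital of France?", "text": "The capital of France is Paris.", "rejected_text": "The capital of France is Berlin."}
+```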
- - -## Parameters - -``` -❯ autotrain llm --help -usage: autotrain [] llm [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME [--data-path DATA_PATH] - [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] [--batch-size BATCH_SIZE] [--seed SEED] - [--epochs EPOCHS] [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] - [--lr LR] [--log {none,wandb,tensorboard}] [--text_column TEXT_COLUMN] - [--rejected_text_column REJECTED_TEXT_COLUMN] [--prompt-text-column PROMPT_TEXT_COLUMN] - [--model-ref MODEL_REF] [--warmup_ratio WARMUP_RATIO] [--optimizer OPTIMIZER] [--scheduler SCHEDULER] - [--weight_decay WEIGHT_DECAY] [--max_grad_norm MAX_GRAD_NORM] [--add_eos_token] [--block_size BLOCK_SIZE] - [--peft] [--lora_r LORA_R] [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] - [--logging_steps LOGGING_STEPS] [--evaluation_strategy {epoch,steps,no}] - [--save_total_limit SAVE_TOTAL_LIMIT] [--auto_find_batch_size] - [--mixed_precision {fp16,bf16,None}] [--quantization {int4,int8,None}] [--model_max_length MODEL_MAX_LENGTH] - [--max_prompt_length MAX_PROMPT_LENGTH] [--max_completion_length MAX_COMPLETION_LENGTH] - [--trainer {default,dpo,sft,orpo,reward}] [--target_modules TARGET_MODULES] [--merge_adapter] - [--use_flash_attention_2] [--dpo-beta DPO_BETA] [--chat_template {tokenizer,chatml,zephyr,None}] - [--padding {left,right,None}] - -✨ Run AutoTrain LLM - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE, --train-batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION, --gradient-accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing, --disable-gradient-checkpointing, --disable-gc - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --text_column TEXT_COLUMN, --text-column TEXT_COLUMN - Specify the dataset column to use for text data. This parameter is essential for models processing textual information. - Default is 'text'. 
- --rejected_text_column REJECTED_TEXT_COLUMN, --rejected-text-column REJECTED_TEXT_COLUMN - Define the column to use for storing rejected text entries, which are typically entries that do not meet certain criteria - for processing. Default is 'rejected'. Used only for orpo, dpo and reward trainerss - --prompt-text-column PROMPT_TEXT_COLUMN, --prompt-text-column PROMPT_TEXT_COLUMN - Identify the column that contains prompt text for tasks requiring contextual inputs, such as conversation or completion - generation. Default is 'prompt'. Used only for dpo trainer - --model-ref MODEL_REF - Reference model to use for DPO when not using PEFT - --warmup_ratio WARMUP_RATIO, --warmup-ratio WARMUP_RATIO - Set the proportion of training allocated to warming up the learning rate, which can enhance model stability and performance - at the start of training. Default is 0.1 - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight_decay WEIGHT_DECAY, --weight-decay WEIGHT_DECAY - Define the weight decay rate for regularization, which helps prevent overfitting by penalizing larger weights. Default is - 0.0 - --max_grad_norm MAX_GRAD_NORM, --max-grad-norm MAX_GRAD_NORM - Set the maximum norm for gradient clipping, which is critical for preventing gradients from exploding during - backpropagation. Default is 1.0. - --add_eos_token, --add-eos-token - Toggle whether to automatically add an End Of Sentence (EOS) token at the end of texts, which can be critical for certain - types of models like language models. Only used for `default` trainer - --block_size BLOCK_SIZE, --block-size BLOCK_SIZE - Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to - -1 determines block size automatically. Default is -1. - --peft, --use-peft Enable LoRA-PEFT - --lora_r LORA_R, --lora-r LORA_R - Set the 'r' parameter for Low-Rank Adaptation (LoRA). Default is 16. - --lora_alpha LORA_ALPHA, --lora-alpha LORA_ALPHA - Specify the 'alpha' parameter for LoRA. Default is 32. - --lora_dropout LORA_DROPOUT, --lora-dropout LORA_DROPOUT - Set the dropout rate within the LoRA layers to help prevent overfitting during adaptation. Default is 0.05. - --logging_steps LOGGING_STEPS, --logging-steps LOGGING_STEPS - Determine how often to log training progress in terms of steps. Setting it to '-1' determines logging steps automatically. - --evaluation_strategy {epoch,steps,no}, --evaluation-strategy {epoch,steps,no} - Choose how frequently to evaluate the model's performance, with 'epoch' as the default, meaning at the end of each training - epoch - --save_total_limit SAVE_TOTAL_LIMIT, --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of saved model checkpoints to manage disk usage effectively. Default is to save only the latest - checkpoint - --auto_find_batch_size, --auto-find-batch-size - Automatically determine the optimal batch size based on system capabilities to maximize efficiency. - --mixed_precision {fp16,bf16,None}, --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. 
Options are 'fp16', 'bf16', or None for - default precision. Default is None. - --quantization {int4,int8,None}, --quantization {int4,int8,None} - Choose the quantization level to reduce model size and potentially increase inference speed. Options include 'int4', 'int8', - or None. Enabling requires --peft - --model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH - Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. - Default is 1024 - --max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH - Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. - Used only for `orpo` trainer. - --max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH - Completion length to use, for orpo: encoder-decoder models only - --trainer {default,dpo,sft,orpo,reward} - Trainer type to use - --target_modules TARGET_MODULES, --target-modules TARGET_MODULES - Identify specific modules within the model architecture to target with adaptations or optimizations, such as LoRA. Comma - separated list of module names. Default is 'all-linear'. - --merge_adapter, --merge-adapter - Use this flag to merge PEFT adapter with the model - --use_flash_attention_2, --use-flash-attention-2, --use-fa2 - Use flash attention 2 - --dpo-beta DPO_BETA, --dpo-beta DPO_BETA - Beta for DPO trainer - --chat_template {tokenizer,chatml,zephyr,None}, --chat-template {tokenizer,chatml,zephyr,None} - Apply a specific template for chat-based interactions, with options including 'tokenizer', 'chatml', 'zephyr', or None. This - setting can shape the model's conversational behavior. - --padding {left,right,None}, --padding {left,right,None} - Specify the padding direction for sequences, critical for models sensitive to input alignment. Options include 'left', - 'right', or None -``` \ No newline at end of file diff --git a/docs/source/llm_finetuning_params.mdx b/docs/source/llm_finetuning_params.mdx new file mode 100644 index 0000000000..2c499c4f3a --- /dev/null +++ b/docs/source/llm_finetuning_params.mdx @@ -0,0 +1,86 @@ +## LLM Fine Tuning Parameters + +``` +--batch-size BATCH_SIZE, --train-batch-size BATCH_SIZE + Training batch size to use +--seed SEED Random seed for reproducibility +--epochs EPOCHS Number of training epochs +--gradient_accumulation GRADIENT_ACCUMULATION, --gradient-accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps +--disable_gradient_checkpointing, --disable-gradient-checkpointing, --disable-gc + Disable gradient checkpointing +--lr LR Learning rate +--log {none,wandb,tensorboard} + Use experiment tracking +--model-ref MODEL_REF + Reference model to use for DPO when not using PEFT +--warmup_ratio WARMUP_RATIO, --warmup-ratio WARMUP_RATIO + Set the proportion of training allocated to warming up the learning rate, which can enhance model stability and performance + at the start of training. Default is 0.1 +--optimizer OPTIMIZER + Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model + performance. 'adamw_torch' is used by default. +--scheduler SCHEDULER + Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the + learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. 
+--weight_decay WEIGHT_DECAY, --weight-decay WEIGHT_DECAY + Define the weight decay rate for regularization, which helps prevent overfitting by penalizing larger weights. Default is + 0.0 +--max_grad_norm MAX_GRAD_NORM, --max-grad-norm MAX_GRAD_NORM + Set the maximum norm for gradient clipping, which is critical for preventing gradients from exploding during + backpropagation. Default is 1.0. +--add_eos_token, --add-eos-token + Toggle whether to automatically add an End Of Sentence (EOS) token at the end of texts, which can be critical for certain + types of models like language models. Only used for `default` trainer +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--peft, --use-peft Enable LoRA-PEFT +--lora_r LORA_R, --lora-r LORA_R + Set the 'r' parameter for Low-Rank Adaptation (LoRA). Default is 16. +--lora_alpha LORA_ALPHA, --lora-alpha LORA_ALPHA + Specify the 'alpha' parameter for LoRA. Default is 32. +--lora_dropout LORA_DROPOUT, --lora-dropout LORA_DROPOUT + Set the dropout rate within the LoRA layers to help prevent overfitting during adaptation. Default is 0.05. +--logging_steps LOGGING_STEPS, --logging-steps LOGGING_STEPS + Determine how often to log training progress in terms of steps. Setting it to '-1' determines logging steps automatically. +--evaluation_strategy {epoch,steps,no}, --evaluation-strategy {epoch,steps,no} + Choose how frequently to evaluate the model's performance, with 'epoch' as the default, meaning at the end of each training + epoch +--save_total_limit SAVE_TOTAL_LIMIT, --save-total-limit SAVE_TOTAL_LIMIT + Limit the total number of saved model checkpoints to manage disk usage effectively. Default is to save only the latest + checkpoint +--auto_find_batch_size, --auto-find-batch-size + Automatically determine the optimal batch size based on system capabilities to maximize efficiency. +--mixed_precision {fp16,bf16,None}, --mixed-precision {fp16,bf16,None} + Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for + default precision. Default is None. +--quantization {int4,int8,None}, --quantization {int4,int8,None} + Choose the quantization level to reduce model size and potentially increase inference speed. Options include 'int4', 'int8', + or None. Enabling requires --peft +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. + Default is 1024 +--max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH + Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. + Used only for `orpo` trainer. +--max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH + Completion length to use, for orpo: encoder-decoder models only +--trainer {default,dpo,sft,orpo,reward} + Trainer type to use +--target_modules TARGET_MODULES, --target-modules TARGET_MODULES + Identify specific modules within the model architecture to target with adaptations or optimizations, such as LoRA. Comma + separated list of module names. Default is 'all-linear'. 
+--merge_adapter, --merge-adapter + Use this flag to merge PEFT adapter with the model +--use_flash_attention_2, --use-flash-attention-2, --use-fa2 + Use flash attention 2 +--dpo-beta DPO_BETA, --dpo-beta DPO_BETA + Beta for DPO trainer +--chat_template {tokenizer,chatml,zephyr,None}, --chat-template {tokenizer,chatml,zephyr,None} + Apply a specific template for chat-based interactions, with options including 'tokenizer', 'chatml', 'zephyr', or None. This + setting can shape the model's conversational behavior. +--padding {left,right,None}, --padding {left,right,None} + Specify the padding direction for sequences, critical for models sensitive to input alignment. Options include 'left', + 'right', or None +``` \ No newline at end of file diff --git a/docs/source/starting_cli.mdx b/docs/source/quickstart.mdx similarity index 56% rename from docs/source/starting_cli.mdx rename to docs/source/quickstart.mdx index 5a22d75d8a..738078c184 100644 --- a/docs/source/starting_cli.mdx +++ b/docs/source/quickstart.mdx @@ -1,38 +1,46 @@ -# Starting the UI +# Quickstart -To run the autotrain cli locally or in colab, install autotrain-advanced python package: +This quickstart is for local installation and usage. +If you want to use AutoTrain on Hugging Face Spaces, please refer to the *AutoTrain on Hugging Face Spaces* section. -```bash -$ pip install autotrain-advanced -``` - -and then run the following command: +You can install AutoTrain Advanced using pip: ```bash -$ export HF_TOKEN=your_hugging_face_write_token -$ autotrain --help +$ pip install autotrain-advanced ``` -This will start the app on `http://127.0.0.1:8000`. - -AutoTrain doesn't install pytorch, torchaudio, torchvision, or any other dependencies. You will need to install them separately. -It is thus recommended to use conda environment: - +It is advised to install autotrain-advanced in a virtual environment to avoid any conflicts with other packages. +Note: AutoTrain doesn't install pytorch, torchaudio, torchvision, or any other large dependencies. You will need to install them separately. ```bash $ conda create -n autotrain python=3.10 $ conda activate autotrain - $ pip install autotrain-advanced - $ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia $ conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc $ conda install xformers -c xformers - $ python -m nltk.downloader punkt -$ pip install flash-attn --no-build-isolation -$ pip install deepspeed +$ pip install flash-attn --no-build-isolation # if you want to use flash-attn +$ pip install deepspeed # if you want to use deepspeed +```` + +# Running AutoTrain User Interface (UI) +To run the autotrain app locally, you can use the following command: + +```bash +$ export HF_TOKEN=your_hugging_face_write_token +$ autotrain app --host 127.0.0.1 --port 8000 +``` + +This will start the app on `http://127.0.0.1:8000`. 
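+If you are running AutoTrain on a remote machine or want to use a different port, pass other values to the same flags, for example:
+
+```bash
+$ autotrain app --host 0.0.0.0 --port 7860
+```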
+
+
+# Using AutoTrain Command Line Interface (CLI)
+
+It is also possible to use the CLI:
+
+```bash
 $ export HF_TOKEN=your_hugging_face_write_token
 $ autotrain --help
 ```
@@ -40,7 +48,6 @@ $ autotrain --help
 
 This will show the CLI commands that can be used:
 
 ```bash
-$ autotrain --help
 usage: autotrain <command> [<args>]
 
 positional arguments:
@@ -51,6 +58,7 @@ positional arguments:
                 dreambooth,
                 api,
                 text-classification,
+                text-regression,
                 image-classification,
                 tabular,
                 spacerunner,
@@ -63,21 +71,24 @@ positional arguments:
 
 options:
   -h, --help            show this help message and exit
   --version, -v         Display AutoTrain version
+  --config CONFIG       Optional configuration file
 
 For more information about a command, run: `autotrain <command> --help`
 ```
 
+It is advised to use the `autotrain --config CONFIG_FILE` command when using the CLI.
+
 The autotrain commands that end users will be interested in are:
 
 - `app`: Start the AutoTrain UI
 - `llm`: Train a language model
 - `dreambooth`: Train a model using DreamBooth
 - `text-classification`: Train a text classification model
+- `text-regression`: Train a text regression model
 - `image-classification`: Train an image classification model
 - `tabular`: Train a tabular model
 - `spacerunner`: Train any custom model using SpaceRunner
 - `seq2seq`: Train a sequence-to-sequence model
 - `token-classification`: Train a token classification model
-
-In case of any issues, please report on the [GitHub issues](https://github.com/huggingface/autotrain-advanced/).
+Note: the above commands are not required if you use the preferred `autotrain --config CONFIG_FILE` approach to train your models.
\ No newline at end of file
diff --git a/docs/source/quickstart_spaces.mdx b/docs/source/quickstart_spaces.mdx
new file mode 100644
index 0000000000..52f8725e0a
--- /dev/null
+++ b/docs/source/quickstart_spaces.mdx
@@ -0,0 +1,78 @@
+# Quickstart Guide to AutoTrain on Hugging Face Spaces
+
+AutoTrain on Hugging Face Spaces is the preferred choice for a streamlined experience in
+model training. This platform is optimized for ease of use, with pre-installed dependencies
+and managed hardware resources. AutoTrain on Hugging Face Spaces can be used both by
+no-code users and developers, making it versatile for various levels of expertise.
+
+
+## Creating a New AutoTrain Space
+
+Getting started with AutoTrain is straightforward. Here’s how you can create your new space:
+
+1. **Visit the AutoTrain Page**: To create a new space with the AutoTrain Docker image, all you need to do is go
+to the [AutoTrain Homepage](https://hf.co/autotrain) and click on "Create new project".
+
+2. **Log In or View the Setup Screen**: If not logged in, you'll be prompted to do so. Then, you’ll see a screen similar to this:
+
+![autotrain-duplicate-space](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/duplicate_space.png)
+
+3. **Set Up Your Space**:
+
+- **Choose a Space Name**: Name your space something relevant to your project.
+
+- **Allocate Hardware Resources**: Select the necessary computational resources based on your project needs.
+
+- **Duplicate Space**: Click on "Duplicate Space" to initiate your AutoTrain space with the Docker image.
+
+4. **Configuration Options**:
+
+- PAUSE_ON_FAILURE: Set this to 0 if you prefer the space not to pause on training failures. This is useful if you want to run many experiments continuously in the same space.
+
+5. 
**Launch and Train**:
+
+- Once done, the AutoTrain Space will be up and running in a few seconds, and you will be presented with the following screen:
+
+![autotrain-space](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_space.png)
+
+- From here, you can select tasks, upload datasets, choose models, adjust hyperparameters (if needed),
+and start the training process directly within the space.
+
+- The space will manage its own activity, shutting down post-training unless configured
+otherwise based on the `PAUSE_ON_FAILURE` setting.
+
+6. **Monitoring Progress**:
+
+- All training logs and progress can be monitored via TensorBoard, accessible under
+`username/project_name` on the Hugging Face Hub.
+
+- Once training concludes successfully, you’ll find the model files in the same repository.
+
+7. **Navigating the UI**:
+
+- If you need help understanding any UI element, click on the small (i) information icon next to it for a detailed description.
+
+For data formats and detailed parameter information, please see the Data Formats and Parameters sections, where we provide
+example datasets and detailed information about the parameters for each task supported by AutoTrain.
+
+## Ensuring Your AutoTrain is Up-to-Date
+
+We are constantly adding new features and tasks to AutoTrain Advanced. To benefit from the latest features, tasks, and bug fixes, update your AutoTrain space regularly:
+
+- *Factory Reboot*: Navigate to the settings page of your space and click on "Factory reboot" to upgrade to the latest version of AutoTrain Advanced.
+
+![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/space_template_5.png)
+
+- *Note*: Simply "restarting" the space does not update it; a factory reboot is necessary for a complete update.
+
+
+With these steps, you can effortlessly initiate and manage your AutoTrain projects on
+Hugging Face Spaces, leveraging the platform's robust capabilities for your machine learning and AI
+needs.
diff --git a/docs/source/seq2seq.mdx b/docs/source/seq2seq.mdx
index 6daf43999c..e8505cdbcc 100644
--- a/docs/source/seq2seq.mdx
+++ b/docs/source/seq2seq.mdx
@@ -30,111 +30,3 @@ Or as a JSONL file:
 
 ## Columns
 
 Your CSV/JSONL dataset must have two columns: `text` and `target`.
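+
+Once your data is in this format, you can launch training from the command line. The following is only a minimal
+sketch: the flags come from the `autotrain seq2seq` CLI (see the Seq2Seq parameters section for the full list), and
+the model name, project name, and data path are placeholder values you should replace with your own:
+
+```bash
+autotrain seq2seq --train \
+  --model google/flan-t5-base \
+  --project-name my-seq2seq-project \
+  --data-path data/ \
+  --text-column text \
+  --target-column target \
+  --epochs 3 \
+  --batch-size 8 \
+  --lr 5e-5
+```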
- -## Parameters - -``` -❯ autotrain seq2seq --help -usage: autotrain [] seq2seq [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME [--data-path DATA_PATH] - [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] [--batch-size BATCH_SIZE] [--seed SEED] - [--epochs EPOCHS] [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] - [--lr LR] [--log {none,wandb,tensorboard}] [--text-column TEXT_COLUMN] [--target-column TARGET_COLUMN] - [--max-seq-length MAX_SEQ_LENGTH] [--max-target-length MAX_TARGET_LENGTH] [--warmup-ratio WARMUP_RATIO] - [--optimizer OPTIMIZER] [--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] - [--max-grad-norm MAX_GRAD_NORM] [--logging-steps LOGGING_STEPS] - [--evaluation-strategy EVALUATION_STRATEGY] [--save-total-limit SAVE_TOTAL_LIMIT] - [--auto-find-batch-size] [--mixed-precision {fp16,bf16,None}] [--peft] - [--quantization {int8,None}] [--lora-r LORA_R] [--lora-alpha LORA_ALPHA] [--lora-dropout LORA_DROPOUT] - [--target-modules TARGET_MODULES] - -✨ Run AutoTrain Seq2Seq - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. - --target-column TARGET_COLUMN - Specify the column name that holds the target data for training. Helps in distinguishing different potential outputs. - Default is 'target'. - --max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. - --max-target-length MAX_TARGET_LENGTH - Define the maximum number of tokens for the target sequence in each input. Useful for models that generate outputs, ensuring - uniformity in sequence length. Default is set to 128 tokens. 
- --warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. - --max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. - --logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. - --evaluation-strategy EVALUATION_STRATEGY - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. - --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. - --auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. - --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. - --peft Enable LoRA-PEFT - --quantization {int8,None} - Select the quantization mode to reduce model size and potentially increase inference speed. Options include 'int8' for 8-bit - integer quantization or None for no quantization. Default is None - --lora-r LORA_R Set the rank 'R' for the LoRA (Low-Rank Adaptation) technique. Default is 16. - --lora-alpha LORA_ALPHA - Specify the 'Alpha' parameter for LoRA. Default is 32. - --lora-dropout LORA_DROPOUT - Determine the dropout rate to apply in the LoRA layers, which can help in preventing overfitting by randomly disabling a - fraction of neurons during training. Default rate is 0.05. - --target-modules TARGET_MODULES - List the modules within the model architecture that should be targeted for specific techniques such as LoRA adaptations. - Useful for fine-tuning particular components of large models. By default all linear layers are targeted. 
-``` diff --git a/docs/source/seq2seq_params.mdx b/docs/source/seq2seq_params.mdx new file mode 100644 index 0000000000..eb23678e8a --- /dev/null +++ b/docs/source/seq2seq_params.mdx @@ -0,0 +1,70 @@ +# Seq2Seq Parameters + +``` +--batch-size BATCH_SIZE + Training batch size to use +--seed SEED Random seed for reproducibility +--epochs EPOCHS Number of training epochs +--gradient_accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps +--disable_gradient_checkpointing + Disable gradient checkpointing +--lr LR Learning rate +--log {none,wandb,tensorboard} + Use experiment tracking +--text-column TEXT_COLUMN + Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. + Default is 'text'. +--target-column TARGET_COLUMN + Specify the column name that holds the target data for training. Helps in distinguishing different potential outputs. + Default is 'target'. +--max-seq-length MAX_SEQ_LENGTH + Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are + truncated. Affects both memory usage and computational requirements. Default is 128 tokens. +--max-target-length MAX_TARGET_LENGTH + Define the maximum number of tokens for the target sequence in each input. Useful for models that generate outputs, ensuring + uniformity in sequence length. Default is set to 128 tokens. +--warmup-ratio WARMUP_RATIO + Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help + in stabilizing the training process early on. Default ratio is 0.1. +--optimizer OPTIMIZER + Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model + performance. 'adamw_torch' is used by default. +--scheduler SCHEDULER + Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the + learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. +--weight-decay WEIGHT_DECAY + Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large + weights. Default is 0.0, meaning no weight decay is applied. +--max-grad-norm MAX_GRAD_NORM + Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient + problem in deep neural networks. Default is 1.0. +--logging-steps LOGGING_STEPS + Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging + steps automatically. Default is -1. +--evaluation-strategy EVALUATION_STRATEGY + Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of + each training epoch by default. +--save-total-limit SAVE_TOTAL_LIMIT + Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. + Default is to save only the latest one. +--auto-find-batch-size + Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch + size that fits in memory. +--mixed-precision {fp16,bf16,None} + Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for + default precision. Default is None. 
+--peft                Enable LoRA-PEFT
+--quantization {int8,None}
+                        Select the quantization mode to reduce model size and potentially increase inference speed. Options include 'int8' for 8-bit
+                        integer quantization or None for no quantization. Default is None.
+--lora-r LORA_R       Set the rank 'R' for the LoRA (Low-Rank Adaptation) technique. Default is 16.
+--lora-alpha LORA_ALPHA
+                        Specify the 'Alpha' parameter for LoRA. Default is 32.
+--lora-dropout LORA_DROPOUT
+                        Determine the dropout rate to apply in the LoRA layers, which can help in preventing overfitting by randomly disabling a
+                        fraction of neurons during training. Default rate is 0.05.
+--target-modules TARGET_MODULES
+                        List the modules within the model architecture that should be targeted for specific techniques such as LoRA adaptations.
+                        Useful for fine-tuning particular components of large models. By default, all linear layers are targeted.
+```
diff --git a/docs/source/starting_ui.mdx b/docs/source/starting_ui.bck
similarity index 100%
rename from docs/source/starting_ui.mdx
rename to docs/source/starting_ui.bck
diff --git a/docs/source/support.mdx b/docs/source/support.mdx
index ae68375419..d52733d21b 100644
--- a/docs/source/support.mdx
+++ b/docs/source/support.mdx
@@ -1,12 +1,31 @@
 # Help and Support
 
-To get help and support for autotrain, there are 3 ways:
+If you need assistance with AutoTrain Advanced or have questions about your projects,
+you can reach out through several dedicated support channels. We're here to help you
+navigate any issues you encounter, from technical queries to billing concerns.
+Below are the best ways to get support:
 
-- [Create an issue](https://github.com/huggingface/autotrain-advanced/issues/new) in AutoTrain Advanced GitHub repository.
-- [Ask in the Hugging Face Forum](https://discuss.huggingface.co/c/autotrain/16).
+- For technical support or to report a bug, you can [create an issue](https://github.com/huggingface/autotrain-advanced/issues/new)
+directly in the AutoTrain Advanced GitHub repository. The GitHub repo is ideal for tracking bugs,
+requesting features, or getting help with troubleshooting problems. When submitting an
+issue, please include as many relevant details as possible to help us provide useful
+support quickly.
 
-- [Email us](mailto:autotrain@hf.co) directly (Enterprise users and billing questions only).
+- [Ask in the Hugging Face Forum](https://discuss.huggingface.co/c/autotrain/16). This space is perfect for asking questions,
+sharing your experiences, or discussing AutoTrain with other users and the Hugging Face
+team. The forum is a great resource for getting advice, learning best practices, and
+connecting with other machine learning practitioners.
 
+- For enterprise users or specific inquiries related to billing, please [email us](mailto:autotrain@hf.co) directly.
+This channel ensures that your more sensitive or account-specific issues are handled
+appropriately and confidentially. When emailing, please provide your username and
+project name so we can assist you efficiently.
 
-Please don't forget to mention your username and project name if you have a specific question about your project.
+Please note: email support is only available for pro/enterprise users or those with specific queries about billing.
+
+
+By utilizing these support channels, you can ensure that any hurdles you face while using
+AutoTrain Advanced are addressed promptly, allowing you to focus on achieving your project
+goals. 
Whether you're a beginner or an experienced user, we are here to support your +journey in AI model training. diff --git a/docs/source/tabular.mdx b/docs/source/tabular.mdx index 2acc9d338f..45374c9bf5 100644 --- a/docs/source/tabular.mdx +++ b/docs/source/tabular.mdx @@ -42,75 +42,3 @@ id,category1,category2,feature1,target ## Columns Your CSV dataset must have two columns: `id` and `target`. - - -## Parameters - -``` -❯ autotrain tabular --help -usage: autotrain [] tabular [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME [--data-path DATA_PATH] - [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] [--batch-size BATCH_SIZE] [--seed SEED] - --target-columns TARGET_COLUMNS [--categorical-columns CATEGORICAL_COLUMNS] - [--numerical-columns NUMERICAL_COLUMNS] --id-column ID_COLUMN --task {classification,regression} - [--num-trials NUM_TRIALS] [--time-limit TIME_LIMIT] [--categorical-imputer {most_frequent,None}] - [--numerical-imputer {mean,median,None}] [--numeric-scaler {standard,minmax,normal,robust}] - -✨ Run AutoTrain Tabular Data Training - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --target-columns TARGET_COLUMNS - Specify the names of the target or label columns separated by commas if multiple. These columns are what the model will - predict. Required for defining the output of the model. - --categorical-columns CATEGORICAL_COLUMNS - List the names of columns that contain categorical data, useful for models that need explicit handling of such data. - Categorical data is typically processed differently from numerical data, such as through encoding. If not specified, the - model will infer the data type. - --numerical-columns NUMERICAL_COLUMNS - Identify columns that contain numerical data. Proper specification helps in applying appropriate scaling and normalization - techniques, which can significantly impact model performance. If not specified, the model will infer the data type. - --id-column ID_COLUMN - Specify the column name that uniquely identifies each row in the dataset. This is critical for tracking samples through the - model pipeline and is often excluded from model training. Required field. 
- --task {classification,regression}
-                        Define the type of machine learning task, such as 'classification', 'regression'. This parameter determines the model's
-                        architecture and the loss function to use. Required to properly configure the model.
- --num-trials NUM_TRIALS
-                        Set the number of trials for hyperparameter tuning or model experimentation. More trials can lead to better model
-                        configurations but require more computational resources. Default is 100 trials.
- --time-limit TIME_LIMIT
-                        mpose a time limit (in seconds) for training or searching for the best model configuration. This helps manage resource
-                        allocation and ensures the process does not exceed available computational budgets. The default is 3600 seconds (1 hour).
- --categorical-imputer {most_frequent,None}
-                        Select the method or strategy to impute missing values in categorical columns. Options might include 'most_frequent',
-                        'None'. Correct imputation can prevent biases and improve model accuracy.
- --numerical-imputer {mean,median,None}
-                        Choose the imputation strategy for missing values in numerical columns. Common strategies include 'mean', & 'median'.
-                        Accurate imputation is vital for maintaining the integrity of numerical data.
- --numeric-scaler {standard,minmax,normal,robust}
-                        Determine the type of scaling to apply to numerical data. Examples include 'standard' (zero mean and unit variance), 'min-
-                        max' (scaled between given range), etc. Scaling is essential for many algorithms to perform optimally
-```
diff --git a/docs/source/tabular_params.mdx b/docs/source/tabular_params.mdx
new file mode 100644
index 0000000000..a2dd7fd1b6
--- /dev/null
+++ b/docs/source/tabular_params.mdx
@@ -0,0 +1,38 @@
+# Tabular Parameters
+
+```
+--batch-size BATCH_SIZE
+                        Training batch size to use
+--seed SEED           Random seed for reproducibility
+--target-columns TARGET_COLUMNS
+                        Specify the names of the target or label columns separated by commas if multiple. These columns are what the model will
+                        predict. Required for defining the output of the model.
+--categorical-columns CATEGORICAL_COLUMNS
+                        List the names of columns that contain categorical data, useful for models that need explicit handling of such data.
+                        Categorical data is typically processed differently from numerical data, such as through encoding. If not specified, the
+                        model will infer the data type.
+--numerical-columns NUMERICAL_COLUMNS
+                        Identify columns that contain numerical data. Proper specification helps in applying appropriate scaling and normalization
+                        techniques, which can significantly impact model performance. If not specified, the model will infer the data type.
+--id-column ID_COLUMN
+                        Specify the column name that uniquely identifies each row in the dataset. This is critical for tracking samples through the
+                        model pipeline and is often excluded from model training. Required field.
+--task {classification,regression}
+                        Define the type of machine learning task, such as 'classification', 'regression'. This parameter determines the model's
+                        architecture and the loss function to use. Required to properly configure the model.
+--num-trials NUM_TRIALS
+                        Set the number of trials for hyperparameter tuning or model experimentation. More trials can lead to better model
+                        configurations but require more computational resources. Default is 100 trials.
+--time-limit TIME_LIMIT
+                        Impose a time limit (in seconds) for training or searching for the best model configuration. 
This helps manage resource + allocation and ensures the process does not exceed available computational budgets. The default is 3600 seconds (1 hour). +--categorical-imputer {most_frequent,None} + Select the method or strategy to impute missing values in categorical columns. Options might include 'most_frequent', + 'None'. Correct imputation can prevent biases and improve model accuracy. +--numerical-imputer {mean,median,None} + Choose the imputation strategy for missing values in numerical columns. Common strategies include 'mean', & 'median'. + Accurate imputation is vital for maintaining the integrity of numerical data. +--numeric-scaler {standard,minmax,normal,robust} + Determine the type of scaling to apply to numerical data. Examples include 'standard' (zero mean and unit variance), 'min- + max' (scaled between given range), etc. Scaling is essential for many algorithms to perform optimally +``` diff --git a/docs/source/text_classification.mdx b/docs/source/text_classification.mdx index a5f7cf2c02..e0eebece92 100644 --- a/docs/source/text_classification.mdx +++ b/docs/source/text_classification.mdx @@ -54,95 +54,3 @@ Instead of CSV you can also use JSONL format. The JSONL format should be as foll ## Columns Your CSV dataset must have two columns: `text` and `target`. - - -### Params - -``` -❯ autotrain text-classification --help -usage: autotrain [] text-classification [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME - [--data-path DATA_PATH] [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] - [--batch-size BATCH_SIZE] [--seed SEED] [--epochs EPOCHS] - [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] [--lr LR] - [--log {none,wandb,tensorboard}] [--text-column TEXT_COLUMN] [--target-column TARGET_COLUMN] - [--max-seq-length MAX_SEQ_LENGTH] [--warmup-ratio WARMUP_RATIO] [--optimizer OPTIMIZER] - [--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] [--max-grad-norm MAX_GRAD_NORM] - [--logging-steps LOGGING_STEPS] [--evaluation-strategy {steps,epoch,no}] - [--save-total-limit SAVE_TOTAL_LIMIT] - [--auto-find-batch-size] [--mixed-precision {fp16,bf16,None}] - -✨ Run AutoTrain Text Classification - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. 
When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. - --target-column TARGET_COLUMN - Specify the column name that holds the target or label data for training. Helps in distinguishing different potential - outputs. Default is 'target'. - --max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. - --warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. - --max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. - --logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. - --evaluation-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. - --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. - --auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. - --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. 
-``` \ No newline at end of file diff --git a/docs/source/text_classification_params.mdx b/docs/source/text_classification_params.mdx new file mode 100644 index 0000000000..4c50c454d2 --- /dev/null +++ b/docs/source/text_classification_params.mdx @@ -0,0 +1,54 @@ +# Text Classification & Regression Parameters + +``` +--batch-size BATCH_SIZE + Training batch size to use +--seed SEED Random seed for reproducibility +--epochs EPOCHS Number of training epochs +--gradient_accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps +--disable_gradient_checkpointing + Disable gradient checkpointing +--lr LR Learning rate +--log {none,wandb,tensorboard} + Use experiment tracking +--text-column TEXT_COLUMN + Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. + Default is 'text'. +--target-column TARGET_COLUMN + Specify the column name that holds the target or label data for training. Helps in distinguishing different potential + outputs. Default is 'target'. +--max-seq-length MAX_SEQ_LENGTH + Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are + truncated. Affects both memory usage and computational requirements. Default is 128 tokens. +--warmup-ratio WARMUP_RATIO + Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help + in stabilizing the training process early on. Default ratio is 0.1. +--optimizer OPTIMIZER + Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model + performance. 'adamw_torch' is used by default. +--scheduler SCHEDULER + Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the + learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. +--weight-decay WEIGHT_DECAY + Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large + weights. Default is 0.0, meaning no weight decay is applied. +--max-grad-norm MAX_GRAD_NORM + Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient + problem in deep neural networks. Default is 1.0. +--logging-steps LOGGING_STEPS + Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging + steps automatically. Default is -1. +--evaluation-strategy {steps,epoch,no} + Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of + each training epoch by default. +--save-total-limit SAVE_TOTAL_LIMIT + Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. + Default is to save only the latest one. +--auto-find-batch-size + Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch + size that fits in memory. +--mixed-precision {fp16,bf16,None} + Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for + default precision. Default is None. 
+``` \ No newline at end of file diff --git a/docs/source/token_classification.mdx b/docs/source/token_classification.mdx index a991bbc6e9..704b6ec93f 100644 --- a/docs/source/token_classification.mdx +++ b/docs/source/token_classification.mdx @@ -54,93 +54,3 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size): ## Columns Your CSV/JSONL dataset must have two columns: `tokens` and `tags`. - -### Parameters - -``` -❯ autotrain token-classification --help -usage: autotrain [] token-classification [-h] [--train] [--deploy] [--inference] [--username USERNAME] - [--backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf}] - [--token TOKEN] [--push-to-hub] --model MODEL --project-name PROJECT_NAME - [--data-path DATA_PATH] [--train-split TRAIN_SPLIT] [--valid-split VALID_SPLIT] - [--batch-size BATCH_SIZE] [--seed SEED] [--epochs EPOCHS] - [--gradient_accumulation GRADIENT_ACCUMULATION] [--disable_gradient_checkpointing] - [--lr LR] [--log {none,wandb,tensorboard}] [--tokens-column TOKENS_COLUMN] - [--tags-column TAGS_COLUMN] [--max-seq-length MAX_SEQ_LENGTH] [--warmup-ratio WARMUP_RATIO] - [--optimizer OPTIMIZER] [--scheduler SCHEDULER] [--weight-decay WEIGHT_DECAY] - [--max-grad-norm MAX_GRAD_NORM] [--logging-steps LOGGING_STEPS] - [--evaluation-strategy {steps,epoch,no}] [--save-total-limit SAVE_TOTAL_LIMIT] - [--auto-find-batch-size] - [--mixed-precision {fp16,bf16,None}] - -✨ Run AutoTrain Token Classification - -options: - -h, --help show this help message and exit - --train Command to train the model - --deploy Command to deploy the model (limited availability) - --inference Command to run inference (limited availability) - --username USERNAME Hugging Face Hub Username - --backend {local-cli,spaces-a10gl,spaces-a10gs,spaces-a100,spaces-t4m,spaces-t4s,spaces-cpu,spaces-cpuf} - Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only. - --token TOKEN Your Hugging Face API token. Token must have write access to the model hub. - --push-to-hub Push to hub after training will push the trained model to the Hugging Face model hub. - --model MODEL Base model to use for training - --project-name PROJECT_NAME - Output directory / repo id for trained model (must be unique on hub) - --data-path DATA_PATH - Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate - formats - --train-split TRAIN_SPLIT - Train dataset split to use - --valid-split VALID_SPLIT - Validation dataset split to use - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --log {none,wandb,tensorboard} - Use experiment tracking - --tokens-column TOKENS_COLUMN - Tokens column to use. Must be a stringified list of tokens if using a CSV file. Default is 'tokens'. - --tags-column TAGS_COLUMN - Tags column to use. Must be a stringified list of tags if using a CSV file. Default is 'tags'. - --max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. 
- --warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. - --optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. - --scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. - --weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. - --max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. - --logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. - --evaluation-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. - --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. - --auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. - --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` diff --git a/docs/source/token_classification_params.mdx b/docs/source/token_classification_params.mdx new file mode 100644 index 0000000000..21ffd6b86e --- /dev/null +++ b/docs/source/token_classification_params.mdx @@ -0,0 +1,52 @@ +# Token Classification Parameters + +``` +--batch-size BATCH_SIZE + Training batch size to use +--seed SEED Random seed for reproducibility +--epochs EPOCHS Number of training epochs +--gradient_accumulation GRADIENT_ACCUMULATION + Gradient accumulation steps +--disable_gradient_checkpointing + Disable gradient checkpointing +--lr LR Learning rate +--log {none,wandb,tensorboard} + Use experiment tracking +--tokens-column TOKENS_COLUMN + Tokens column to use. Must be a stringified list of tokens if using a CSV file. Default is 'tokens'. +--tags-column TAGS_COLUMN + Tags column to use. Must be a stringified list of tags if using a CSV file. Default is 'tags'. +--max-seq-length MAX_SEQ_LENGTH + Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are + truncated. Affects both memory usage and computational requirements. Default is 128 tokens. +--warmup-ratio WARMUP_RATIO + Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help + in stabilizing the training process early on. Default ratio is 0.1. +--optimizer OPTIMIZER + Choose the optimizer algorithm for training the model. 
Different optimizers can affect the training speed and model + performance. 'adamw_torch' is used by default. +--scheduler SCHEDULER + Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the + learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. +--weight-decay WEIGHT_DECAY + Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large + weights. Default is 0.0, meaning no weight decay is applied. +--max-grad-norm MAX_GRAD_NORM + Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient + problem in deep neural networks. Default is 1.0. +--logging-steps LOGGING_STEPS + Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging + steps automatically. Default is -1. +--evaluation-strategy {steps,epoch,no} + Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of + each training epoch by default. +--save-total-limit SAVE_TOTAL_LIMIT + Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. + Default is to save only the latest one. +--auto-find-batch-size + Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch + size that fits in memory. +--mixed-precision {fp16,bf16,None} + Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for + default precision. Default is None. +``` diff --git a/src/autotrain/backends/endpoints.py b/src/autotrain/backends/endpoints.py index 61d5752e08..31f8293863 100644 --- a/src/autotrain/backends/endpoints.py +++ b/src/autotrain/backends/endpoints.py @@ -7,7 +7,7 @@ class EndpointsRunner(BaseBackend): - def _create(self): + def create(self): hardware = self.available_hardware[self.backend] accelerator = hardware.split("_")[2] instance_size = hardware.split("_")[3] diff --git a/src/autotrain/cli/run_dreambooth.py b/src/autotrain/cli/run_dreambooth.py index 8500663aa6..91a6b23531 100644 --- a/src/autotrain/cli/run_dreambooth.py +++ b/src/autotrain/cli/run_dreambooth.py @@ -389,4 +389,5 @@ def run(self): params = DreamBoothTrainingParams(**vars(self.args)) params = dreambooth_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_image_classification.py b/src/autotrain/cli/run_image_classification.py index be7ae23aaf..21c08a2f78 100644 --- a/src/autotrain/cli/run_image_classification.py +++ b/src/autotrain/cli/run_image_classification.py @@ -167,4 +167,5 @@ def run(self): params = ImageClassificationParams(**vars(self.args)) params = img_clf_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_llm.py b/src/autotrain/cli/run_llm.py index aebdc8f606..a28b3d64e0 100644 --- a/src/autotrain/cli/run_llm.py +++ b/src/autotrain/cli/run_llm.py @@ -345,4 +345,5 @@ def run(self): params = 
LLMTrainingParams(**vars(self.args)) params = llm_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_seq2seq.py b/src/autotrain/cli/run_seq2seq.py index b1a2a950ee..cfb838a5af 100644 --- a/src/autotrain/cli/run_seq2seq.py +++ b/src/autotrain/cli/run_seq2seq.py @@ -206,4 +206,5 @@ def run(self): params = Seq2SeqParams(**vars(self.args)) params = seq2seq_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_spacerunner.py b/src/autotrain/cli/run_spacerunner.py index 26f40c0d87..7b814a4268 100644 --- a/src/autotrain/cli/run_spacerunner.py +++ b/src/autotrain/cli/run_spacerunner.py @@ -138,6 +138,6 @@ def run(self): env=self.args.env, args=self.args.args, ) - sr = SpaceRunner(params=params, backend=self.args.backend) - space_id = sr.prepare() - logger.info(f"SpaceRunner created with ID: {space_id}") + project = SpaceRunner(params=params, backend=self.args.backend) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_tabular.py b/src/autotrain/cli/run_tabular.py index 86eb5bb2ad..ace5fce982 100644 --- a/src/autotrain/cli/run_tabular.py +++ b/src/autotrain/cli/run_tabular.py @@ -143,4 +143,5 @@ def run(self): params = TabularParams(**vars(self.args)) params = tabular_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_text_classification.py b/src/autotrain/cli/run_text_classification.py index 2161babe1e..b1f6c42185 100644 --- a/src/autotrain/cli/run_text_classification.py +++ b/src/autotrain/cli/run_text_classification.py @@ -167,4 +167,5 @@ def run(self): params = TextClassificationParams(**vars(self.args)) params = text_clf_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_text_regression.py b/src/autotrain/cli/run_text_regression.py index 5dfeb78a9f..929e93fbec 100644 --- a/src/autotrain/cli/run_text_regression.py +++ b/src/autotrain/cli/run_text_regression.py @@ -167,4 +167,5 @@ def run(self): params = TextRegressionParams(**vars(self.args)) params = text_reg_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_token_classification.py b/src/autotrain/cli/run_token_classification.py index 35cd497730..ce98412338 100644 --- a/src/autotrain/cli/run_token_classification.py +++ b/src/autotrain/cli/run_token_classification.py @@ -167,4 +167,5 @@ def run(self): params = TokenClassificationParams(**vars(self.args)) params = token_clf_munge_data(params, local=self.args.backend.startswith("local")) project = AutoTrainProject(params=params, backend=self.args.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git 
a/src/autotrain/parser.py b/src/autotrain/parser.py index 940b0a6ecd..26dc628ed4 100644 --- a/src/autotrain/parser.py +++ b/src/autotrain/parser.py @@ -145,4 +145,5 @@ def run(self): _munge_fn = self.munge_data_map[self.task] _munge_fn(_params, local=self.backend.startswith("local")) project = AutoTrainProject(params=_params, backend=self.backend) - _ = project.create() + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/preprocessor/dreambooth.py b/src/autotrain/preprocessor/dreambooth.py index b821c1d3b1..bd476a4ada 100644 --- a/src/autotrain/preprocessor/dreambooth.py +++ b/src/autotrain/preprocessor/dreambooth.py @@ -35,9 +35,14 @@ def __post_init__(self): def _upload_concept_images(self, file, api): logger.info(f"Uploading {file} to concept1") + if isinstance(file, str): + path_in_repo = f"concept1/{file.split('/')[-1]}" + else: + path_in_repo = f"concept1/{file.filename.split('/')[-1]}" + api.upload_file( - path_or_fileobj=file.file.read(), - path_in_repo=f"concept1/{file.filename.split('/')[-1]}", + path_or_fileobj=file if isinstance(file, str) else file.file.read(), + path_in_repo=path_in_repo, repo_id=self.repo_name, repo_type="dataset", token=self.token, diff --git a/src/autotrain/project.py b/src/autotrain/project.py index 3423ea4924..20fab72fd2 100644 --- a/src/autotrain/project.py +++ b/src/autotrain/project.py @@ -51,18 +51,18 @@ def __post_init__(self): def create(self): if self.backend.startswith("local"): runner = LocalRunner(params=self.params, backend=self.backend) - runner.create() + return runner.create() elif self.backend.startswith("spaces-"): runner = SpaceRunner(params=self.params, backend=self.backend) - runner.create() + return runner.create() elif self.backend.startswith("ep-"): runner = EndpointsRunner(params=self.params, backend=self.backend) - runner.create() + return runner.create() elif self.backend.startswith("ngc-"): runner = NGCRunner(params=self.params, backend=self.backend) - runner.create() + return runner.create() elif self.backend.startswith("nvcf-"): runner = NVCFRunner(params=self.params, backend=self.backend) - runner.create() + return runner.create() else: raise NotImplementedError diff --git a/static/autotrain_space.png b/static/autotrain_space.png new file mode 100644 index 0000000000..8cc8d1e56a Binary files /dev/null and b/static/autotrain_space.png differ diff --git a/static/duplicate_space.png b/static/duplicate_space.png new file mode 100644 index 0000000000..22926aae59 Binary files /dev/null and b/static/duplicate_space.png differ