diff --git a/requirements.txt b/requirements.txt index 6dc131cf80..ab674800d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ nltk==3.8.1 optuna==3.3.0 Pillow==10.0.0 protobuf==4.23.4 -pydantic==1.10.11 sacremoses==0.0.53 scikit-learn==1.3.0 sentencepiece==0.1.99 @@ -20,7 +19,6 @@ werkzeug==2.3.6 xgboost==1.7.6 huggingface_hub>=0.16.4 requests==2.31.0 -gradio==3.41.0 einops==0.6.1 invisible-watermark==0.2.0 packaging==23.1 @@ -35,4 +33,5 @@ diffusers==0.21.4 bitsandbytes==0.41.0 # extras rouge_score==0.1.2 -py7zr==0.20.6 \ No newline at end of file +py7zr==0.20.6 +fastapi==0.104.1 \ No newline at end of file diff --git a/src/autotrain/api.py b/src/autotrain/api.py index 29334eced3..02a3e1c06e 100644 --- a/src/autotrain/api.py +++ b/src/autotrain/api.py @@ -38,10 +38,7 @@ def run_training(): params = json.loads(PARAMS) logger.info(params) if TASK_ID == 9: - try: - params = LLMTrainingParams.parse_raw(params) - except Exception: - params = LLMTrainingParams.parse_obj(params) + params = LLMTrainingParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"] @@ -60,7 +57,7 @@ def run_training(): ] ) elif TASK_ID == 28: - params = Seq2SeqParams.parse_raw(params) + params = Seq2SeqParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"] @@ -79,7 +76,7 @@ def run_training(): ] ) elif TASK_ID in (1, 2): - params = TextClassificationParams.parse_raw(params) + params = TextClassificationParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"] @@ -98,7 +95,7 @@ def run_training(): ] ) elif TASK_ID in (13, 14, 15, 16, 26): - params = TabularParams.parse_raw(params) + params = TabularParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = [ @@ -109,7 +106,7 @@ def run_training(): os.path.join(params.project_name, "training_params.json"), ] elif TASK_ID == 27: - params = GenericParams.parse_raw(params) + params = GenericParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = [ @@ -120,7 +117,7 @@ def run_training(): os.path.join(params.project_name, "training_params.json"), ] elif TASK_ID == 25: - params = DreamBoothTrainingParams.parse_raw(params) + params = DreamBoothTrainingParams.model_validate_json(params) params.project_name = "/tmp/model" params.save(output_dir=params.project_name) cmd = [ diff --git a/src/autotrain/app.py b/src/autotrain/app.py index b25c6d62f0..040318eac8 100644 --- a/src/autotrain/app.py +++ b/src/autotrain/app.py @@ -1,965 +1,298 @@ import json import os -import random -import string -import zipfile +from typing import List -import gradio as gr import pandas as pd -from huggingface_hub import list_models +from fastapi import FastAPI, File, Form, Request, UploadFile +from fastapi.responses import HTMLResponse, JSONResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from loguru import logger -from autotrain import logger from autotrain.dataset import AutoTrainDataset, AutoTrainDreamboothDataset, AutoTrainImageClassificationDataset -from autotrain.languages import SUPPORTED_LANGUAGES 
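# Illustrative sketch, not part of this patch: the api.py hunk above replaces the
# Pydantic v1 helpers parse_raw/parse_obj with their v2 counterparts. Assuming a
# hypothetical ExampleParams class standing in for LLMTrainingParams and friends:
from pydantic import BaseModel, Field


class ExampleParams(BaseModel):
    model: str = Field("gpt2", title="Model name")
    epochs: int = Field(3, title="Number of training epochs")


raw = '{"model": "gpt2", "epochs": 5}'
params = ExampleParams.model_validate_json(raw)              # v2 replacement for parse_raw
params = ExampleParams.model_validate(params.model_dump())   # v2 replacement for parse_obj
print(params.model_dump_json(indent=4))                      # v2 replacement for .json(indent=4)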
-from autotrain.params import Params -from autotrain.project import Project -from autotrain.utils import get_project_cost, get_user_token, user_authentication - - -APP_TASKS = { - "Natural Language Processing": ["Text Classification"], - # "Tabular": TABULAR_TASKS, - "Computer Vision": ["Image Classification", "Dreambooth"], -} - -APP_TASKS_MAPPING = { - "Text Classification": "text_multi_class_classification", - "LLM Finetuning": "lm_training", - "Image Classification": "image_multi_class_classification", - "Dreambooth": "dreambooth", -} - -APP_TASK_TYPE_MAPPING = { - "text_classification": "Natural Language Processing", - "lm_training": "Natural Language Processing", - "image_classification": "Computer Vision", - "dreambooth": "Computer Vision", -} - -ALLOWED_FILE_TYPES = [ - ".csv", - ".CSV", - ".jsonl", - ".JSONL", - ".zip", - ".ZIP", - ".png", - ".PNG", - ".jpg", - ".JPG", - ".jpeg", - ".JPEG", +from autotrain.project import AutoTrainProject +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.dreambooth.params import DreamBoothTrainingParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams + + +HF_TOKEN = os.environ.get("HF_TOKEN", None) +HF_USERNAME = os.environ.get("HF_USERNAME", None) + +HIDDEN_PARAMS = [ + "token", + "project_name", + "username", + "task", + "backend", + "repo_id", + "train_split", + "valid_split", + "text_column", + "rejected_text_column", + "prompt_text_column", + "push_to_hub", + "trainer", + "model", + "data_path", + "image_path", + "class_image_path", + "revision", + "tokenizer", + "class_prompt", + "num_class_images", + "class_labels_conditioning", + "resume_from_checkpoint", + "dataloader_num_workers", + "allow_tf32", + "prior_generation_precision", + "local_rank", + "tokenizer_max_length", + "rank", + "xl", + "checkpoints_total_limit", + "validation_images", + "validation_epochs", + "num_validation_images", + "validation_prompt", + "sample_batch_size", + "log", + "image_column", + "target_column", + "id_column", + "target_columns", ] - -def _login_user(user_token): - user_info = user_authentication(token=user_token) - username = user_info["name"] - - user_can_pay = user_info["canPay"] - orgs = user_info["orgs"] - - valid_orgs = [org for org in orgs if org["canPay"] is True] - valid_orgs = [org for org in valid_orgs if org["roleInOrg"] in ("admin", "write")] - valid_orgs = [org["name"] for org in valid_orgs] - - valid_can_pay = [username] + valid_orgs if user_can_pay else valid_orgs - who_is_training = [username] + [org["name"] for org in orgs] - return user_token, valid_can_pay, who_is_training - - -def _update_task_type(project_type): - return gr.Dropdown.update( - value=APP_TASKS[project_type][0], - choices=APP_TASKS[project_type], - visible=True, - ) - - -def _update_model_choice(task, autotrain_backend): - # TODO: add tabular and remember, for tabular, we only support AutoTrain - if autotrain_backend.lower() != "huggingface internal": - model_choice = ["HuggingFace Hub"] - return gr.Dropdown.update( - value=model_choice[0], - choices=model_choice, - visible=True, - ) - - if task == "LLM Finetuning": - model_choice = ["HuggingFace Hub"] - else: - model_choice = ["AutoTrain", "HuggingFace Hub"] - - return gr.Dropdown.update( - value=model_choice[0], - choices=model_choice, - 
visible=True, - ) - - -def _update_file_type(task): - task = APP_TASKS_MAPPING[task] - if task in ("text_multi_class_classification", "lm_training"): - return gr.Radio.update( - value="CSV", - choices=["CSV", "JSONL"], - visible=True, - ) - elif task == "image_multi_class_classification": - return gr.Radio.update( - value="ZIP", - choices=["Image Subfolders", "ZIP"], - visible=True, - ) - elif task == "dreambooth": - return gr.Radio.update( - value="ZIP", - choices=["Image Folder", "ZIP"], - visible=True, - ) - else: - raise NotImplementedError - - -def _update_param_choice(model_choice, autotrain_backend): - logger.info(f"model_choice: {model_choice}") - choices = ["AutoTrain", "Manual"] if model_choice == "HuggingFace Hub" else ["AutoTrain"] - choices = ["Manual"] if autotrain_backend != "HuggingFace Internal" else choices - return gr.Dropdown.update( - value=choices[0], - choices=choices, - visible=True, - ) - - -def _project_type_update(project_type, task_type, autotrain_backend): - logger.info(f"project_type: {project_type}, task_type: {task_type}") - task_choices_update = _update_task_type(project_type) - model_choices_update = _update_model_choice(task_choices_update["value"], autotrain_backend) - param_choices_update = _update_param_choice(model_choices_update["value"], autotrain_backend) - return [ - task_choices_update, - model_choices_update, - param_choices_update, - _update_hub_model_choices(task_choices_update["value"], model_choices_update["value"]), - ] - - -def _task_type_update(task_type, autotrain_backend): - logger.info(f"task_type: {task_type}") - model_choices_update = _update_model_choice(task_type, autotrain_backend) - param_choices_update = _update_param_choice(model_choices_update["value"], autotrain_backend) - return [ - model_choices_update, - param_choices_update, - _update_hub_model_choices(task_type, model_choices_update["value"]), - ] - - -def _update_col_map(training_data, task): - task = APP_TASKS_MAPPING[task] - if task == "text_multi_class_classification": - data_cols = pd.read_csv(training_data[0].name, nrows=2).columns.tolist() - return [ - gr.Dropdown.update(visible=True, choices=data_cols, label="Map `text` column", value=data_cols[0]), - gr.Dropdown.update(visible=True, choices=data_cols, label="Map `target` column", value=data_cols[1]), - gr.Text.update(visible=False), - ] - elif task == "lm_training": - data_cols = pd.read_csv(training_data[0].name, nrows=2).columns.tolist() - return [ - gr.Dropdown.update(visible=True, choices=data_cols, label="Map `text` column", value=data_cols[0]), - gr.Dropdown.update(visible=False), - gr.Text.update(visible=False), - ] - elif task == "dreambooth": - return [ - gr.Dropdown.update(visible=False), - gr.Dropdown.update(visible=False), - gr.Text.update(visible=True, label="Concept Token", interactive=True), - ] - else: - return [ - gr.Dropdown.update(visible=False), - gr.Dropdown.update(visible=False), - gr.Text.update(visible=False), - ] - - -def _estimate_costs( - training_data, validation_data, task, user_token, autotrain_username, training_params_txt, autotrain_backend -): - if autotrain_backend.lower() != "huggingface internal": - return [ - gr.Markdown.update( - value="Cost estimation is not available for this backend", - visible=True, - ), - gr.Number.update(visible=False), - ] +PARAMS = {} +PARAMS["llm"] = LLMTrainingParams( + target_modules="", + log="tensorboard", + fp16=True, + use_int4=True, + use_int8=False, + use_peft=True, + block_size=1024, + epochs=3, +).model_dump() + 
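# Illustrative sketch, not from the patch: each PARAMS entry above is a trainer's default
# configuration flattened to a plain dict with model_dump(), which the app later filters
# against HIDDEN_PARAMS before rendering a form. A minimal, self-contained example with a
# hypothetical TinyParams model:
from typing import Optional

from pydantic import BaseModel


class TinyParams(BaseModel):
    lr: float = 5e-5
    epochs: int = 3
    token: Optional[str] = None  # secret; filtered out before reaching the browser


defaults = TinyParams().model_dump()
visible = {k: v for k, v in defaults.items() if k not in {"token"}}
# visible -> {"lr": 5e-05, "epochs": 3}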
+PARAMS["text-classification"] = TextClassificationParams().model_dump() +PARAMS["image-classification"] = ImageClassificationParams().model_dump() +PARAMS["dreambooth"] = DreamBoothTrainingParams( + prompt="", + num_steps=500, + gradient_checkpointing=True, + fp16=True, + batch_size=1, + gradient_accumulation=4, + lr=1e-4, +).model_dump() +PARAMS["seq2seq"] = Seq2SeqParams().model_dump() +PARAMS["tabular"] = TabularParams().model_dump() + +app = FastAPI() +# app.mount("/css", StaticFiles(directory="css"), name="css") +app.mount("/static", StaticFiles(directory="static"), name="static") +templates = Jinja2Templates(directory="templates") + + +async def get_request_data(request: Request): + # Request headers + headers = dict(request.headers) + + # Request method + method = request.method + + # Request URL + url = str(request.url) + + # Client host information + client_host = request.client.host + + # Request body + body = await request.body() try: - logger.info("Estimating costs....") - if training_data is None: - return [ - gr.Markdown.update( - value="Could not estimate cost. Please add training data", - visible=True, - ), - gr.Number.update(visible=False), - ] - if validation_data is None: - validation_data = [] - - training_params = json.loads(training_params_txt) - if len(training_params) == 0: - return [ - gr.Markdown.update( - value="Could not estimate cost. Please add atleast one job", - visible=True, - ), - gr.Number.update(visible=False), - ] - elif len(training_params) == 1: - if "num_models" in training_params[0]: - num_models = training_params[0]["num_models"] - else: - num_models = 1 - else: - num_models = len(training_params) - task = APP_TASKS_MAPPING[task] - num_samples = 0 - logger.info("Estimating number of samples") - if task in ("text_multi_class_classification", "lm_training"): - for _f in training_data: - num_samples += pd.read_csv(_f.name).shape[0] - for _f in validation_data: - num_samples += pd.read_csv(_f.name).shape[0] - elif task == "image_multi_class_classification": - logger.info(f"training_data: {training_data}") - if len(training_data) > 1: - return [ - gr.Markdown.update( - value="Only one training file is supported for image classification", - visible=True, - ), - gr.Number.update(visible=False), + body = body.decode("utf-8") + except UnicodeDecodeError: + body = str(body) + + return {"headers": headers, "method": method, "url": url, "client_host": client_host, "body": body} + + +@app.get("/", response_class=HTMLResponse) +async def read_form(request: Request): + """ + This function is used to render the HTML file + :param request: + :return: + """ + if HF_TOKEN is None or HF_USERNAME is None: + return templates.TemplateResponse("error.html", {"request": request}) + return templates.TemplateResponse("index.html", {"request": request}) # The form.html is your saved html file + + +@app.get("/params/{task}", response_class=JSONResponse) +async def fetch_params(task: str): + """ + This function is used to fetch the parameters for a given task + :param task: str + :return: JSONResponse + """ + logger.info(f"Task: {task}") + if task.startswith("llm"): + trainer = task.split(":")[1].lower() + task = task.split(":")[0].lower() + + if task.startswith("tabular"): + task = "tabular" + + if task in PARAMS: + task_params = PARAMS[task] + task_params = {k: v for k, v in task_params.items() if k not in HIDDEN_PARAMS} + if task == "llm": + more_hidden_params = [] + if trainer in ("sft", "reward"): + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", ] 
- if len(validation_data) > 1: - return [ - gr.Markdown.update( - value="Only one validation file is supported for image classification", - visible=True, - ), - gr.Number.update(visible=False), + elif trainer == "generic": + more_hidden_params = [ + "model_ref", + "dpo_beta", ] - for _f in training_data: - zip_ref = zipfile.ZipFile(_f.name, "r") - for _ in zip_ref.namelist(): - num_samples += 1 - for _f in validation_data: - zip_ref = zipfile.ZipFile(_f.name, "r") - for _ in zip_ref.namelist(): - num_samples += 1 - elif task == "dreambooth": - num_samples = len(training_data) - else: - raise NotImplementedError - - logger.info(f"Estimating costs for: num_models: {num_models}, task: {task}, num_samples: {num_samples}") - estimated_cost = get_project_cost( - username=autotrain_username, - token=user_token, - task=task, - num_samples=num_samples, - num_models=num_models, - ) - logger.info(f"Estimated_cost: {estimated_cost}") - return [ - gr.Markdown.update( - value=f"Estimated cost: ${estimated_cost:.2f}. Note: clicking on 'Create Project' will start training and incur charges!", - visible=True, - ), - gr.Number.update(visible=False), - ] - except Exception as e: - logger.error(e) - logger.error("Could not estimate cost, check inputs") - return [ - gr.Markdown.update( - value="Could not estimate cost, check inputs", - visible=True, - ), - gr.Number.update(visible=False), - ] - - -def get_job_params(param_choice, training_params, task): - if param_choice == "autotrain": - if len(training_params) > 1: - raise ValueError("โŒ Only one job parameter is allowed for AutoTrain.") - training_params[0].update({"task": task}) - elif param_choice.lower() == "manual": - for i in range(len(training_params)): - training_params[i].update({"task": task}) - if "hub_model" in training_params[i]: - # remove hub_model from training_params - training_params[i].pop("hub_model") - return training_params - - -def _update_project_name(): - random_project_name = "-".join( - ["".join(random.choices(string.ascii_lowercase + string.digits, k=4)) for _ in range(3)] - ) - # check if training tracker exists - if os.path.exists(os.path.join("/tmp", "training")): - return [ - gr.Text.update(value=random_project_name, visible=True, interactive=True), - gr.Button.update(interactive=False), - ] - return [ - gr.Text.update(value=random_project_name, visible=True, interactive=True), - gr.Button.update(interactive=True), - ] - - -def _update_hub_model_choices(task, model_choice): - task = APP_TASKS_MAPPING[task] - logger.info(f"Updating hub model choices for task: {task}, model_choice: {model_choice}") - if model_choice.lower() == "autotrain": - return gr.Dropdown.update( - visible=False, - interactive=False, - ) - if task == "text_multi_class_classification": - hub_models1 = list_models(filter="fill-mask", sort="downloads", direction=-1, limit=100) - hub_models2 = list_models(filter="text-classification", sort="downloads", direction=-1, limit=100) - hub_models = list(hub_models1) + list(hub_models2) - elif task == "lm_training": - hub_models = list(list_models(filter="text-generation", sort="downloads", direction=-1, limit=100)) - elif task == "image_multi_class_classification": - hub_models = list(list_models(filter="image-classification", sort="downloads", direction=-1, limit=100)) - elif task == "dreambooth": - hub_models = list(list_models(filter="text-to-image", sort="downloads", direction=-1, limit=100)) - else: - raise NotImplementedError - # sort by number of downloads in descending order - hub_models = [{"id": m.modelId, 
"downloads": m.downloads} for m in hub_models if m.private is False] - hub_models = sorted(hub_models, key=lambda x: x["downloads"], reverse=True) - - if task == "dreambooth": - choices = ["stabilityai/stable-diffusion-xl-base-1.0"] + [m["id"] for m in hub_models] - value = choices[0] - return gr.Dropdown.update( - choices=choices, - value=value, - visible=True, - interactive=True, - ) - - return gr.Dropdown.update( - choices=[m["id"] for m in hub_models], - value=hub_models[0]["id"], - visible=True, - interactive=True, - ) - - -def _update_backend(backend): - if backend != "Hugging Face Internal": - return [ - gr.Dropdown.update( - visible=True, - interactive=True, - choices=["HuggingFace Hub"], - value="HuggingFace Hub", - ), - gr.Dropdown.update( - visible=True, - interactive=True, - choices=["Manual"], - value="Manual", - ), - ] - return [ - gr.Dropdown.update( - visible=True, - interactive=True, - ), - gr.Dropdown.update( - visible=True, - interactive=True, - ), - ] - - -def _create_project( - autotrain_username, - valid_can_pay, - project_name, - user_token, - task, - training_data, - validation_data, - col_map_text, - col_map_label, - concept_token, - training_params_txt, - hub_model, - estimated_cost, - autotrain_backend, + elif trainer == "dpo": + more_hidden_params = [ + "add_eos_token", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "dreambooth": + more_hidden_params = [ + "epochs", + "logging", + "bf16", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + return task_params + return {"error": "Task not found"} + + +@app.post("/create_project", response_class=JSONResponse) +async def handle_form( + project_name: str = Form(...), + task: str = Form(...), + base_model: str = Form(...), + hardware: str = Form(...), + params: str = Form(...), + data_files_training: List[UploadFile] = File(...), + data_files_valid: List[UploadFile] = File(...), ): - task = APP_TASKS_MAPPING[task] - valid_can_pay = valid_can_pay.split(",") - can_pay = autotrain_username in valid_can_pay - logger.info(f"๐Ÿšจ๐Ÿšจ๐ŸšจCreating project: {project_name}") - logger.info(f"๐ŸšจTask: {task}") - logger.info(f"๐ŸšจTraining data: {training_data}") - logger.info(f"๐ŸšจValidation data: {validation_data}") - logger.info(f"๐ŸšจTraining params: {training_params_txt}") - logger.info(f"๐ŸšจHub model: {hub_model}") - logger.info(f"๐ŸšจEstimated cost: {estimated_cost}") - logger.info(f"๐Ÿšจ:Can pay: {can_pay}") - - if can_pay is False and estimated_cost > 0: - raise gr.Error("โŒ You do not have enough credits to create this project. 
Please add a valid payment method.") - - training_params = json.loads(training_params_txt) - if len(training_params) == 0: - raise gr.Error("Please add atleast one job") - elif len(training_params) == 1: - if "num_models" in training_params[0]: - param_choice = "autotrain" - else: - param_choice = "manual" - else: - param_choice = "manual" - - if task == "image_multi_class_classification": - training_data = training_data[0].name - if validation_data is not None: - validation_data = validation_data[0].name - dset = AutoTrainImageClassificationDataset( - train_data=training_data, - token=user_token, + """ + This function is used to handle the form submission + """ + + # if HF_TOKEN is None or HF_USERNAME is None, return error + if HF_TOKEN is None or HF_USERNAME is None: + return {"error": "HF_TOKEN or HF_USERNAME not set"} + + params = json.loads(params) + training_files = [f.file for f in data_files_training if f.filename != ""] + validation_files = [f.file for f in data_files_valid if f.filename != ""] if data_files_valid else [] + + if task.startswith("llm"): + trainer = task.split(":")[1].lower() + col_map = {"text": "text"} + if trainer == "reward": + col_map["rejected_text"] = "rejected_text" + if trainer == "dpo": + col_map["prompt"] = "prompt" + col_map["rejected_text"] = "rejected_text" + dset = AutoTrainDataset( + train_data=training_files, + task="lm_training", + token=HF_TOKEN, project_name=project_name, - username=autotrain_username, - valid_data=validation_data, + username=HF_USERNAME, + column_mapping=col_map, + valid_data=validation_files, percent_valid=None, # TODO: add to UI ) - elif task == "text_multi_class_classification": - training_data = [f.name for f in training_data] - if validation_data is None: - validation_data = [] - else: - validation_data = [f.name for f in validation_data] + dset.prepare() + elif task == "text-classification": dset = AutoTrainDataset( - train_data=training_data, - task=task, - token=user_token, + train_data=training_files, + task="text_multi_class_classification", + token=HF_TOKEN, + project_name=project_name, + username=HF_USERNAME, + column_mapping={"text": "text", "label": "target"}, + valid_data=validation_files, + percent_valid=None, # TODO: add to UI + convert_to_class_label=True, + ) + dset.prepare() + elif task == "seq2seq": + dset = AutoTrainDataset( + train_data=training_files, + task="seq2seq", + token=HF_TOKEN, project_name=project_name, - username=autotrain_username, - column_mapping={"text": col_map_text, "label": col_map_label}, - valid_data=validation_data, + username=HF_USERNAME, + column_mapping={"text": "text", "label": "target"}, + valid_data=validation_files, percent_valid=None, # TODO: add to UI ) - elif task == "lm_training": - training_data = [f.name for f in training_data] - if validation_data is None: - validation_data = [] + dset.prepare() + elif task.startswith("tabular"): + trainer = task.split(":")[1].lower() + if trainer == "classification": + task = "tabular_multi_class_classification" + elif trainer == "regression": + task = "tabular_single_column_regression" else: - validation_data = [f.name for f in validation_data] + return {"error": "Unknown subtask"} dset = AutoTrainDataset( - train_data=training_data, + train_data=training_files, task=task, - token=user_token, + token=HF_TOKEN, + project_name=project_name, + username=HF_USERNAME, + column_mapping={"id": "id", "label": ["target"]}, + valid_data=validation_files, + percent_valid=None, # TODO: add to UI + ) + dset.prepare() + elif task == 
"image-classification": + dset = AutoTrainImageClassificationDataset( + train_data=training_files, + token=HF_TOKEN, project_name=project_name, - username=autotrain_username, - column_mapping={"text": col_map_text}, - valid_data=validation_data, + username=HF_USERNAME, + valid_data=validation_files, percent_valid=None, # TODO: add to UI ) + dset.prepare() elif task == "dreambooth": dset = AutoTrainDreamboothDataset( - concept_images=training_data, - concept_name=concept_token, - token=user_token, + concept_images=data_files_training, + concept_name=params["prompt"], + token=HF_TOKEN, project_name=project_name, - username=autotrain_username, + username=HF_USERNAME, + use_v2=True, ) + dset.prepare() else: - raise NotImplementedError - - dset.prepare() - project = Project( - dataset=dset, - param_choice=param_choice, - hub_model=hub_model, - job_params=get_job_params(param_choice, training_params, task), - ) - if autotrain_backend.lower() == "huggingface internal": - project_id = project.create() - project.approve(project_id) - return gr.Markdown.update( - value=f"Project created successfully. Monitor progess on the [dashboard](https://ui.autotrain.huggingface.co/{project_id}/trainings).", - visible=True, - ) - else: - project.create(local=True) - - -def get_variable_name(var, namespace): - for name in namespace: - if namespace[name] is var: - return name - return None - - -def disable_create_project_button(): - return gr.Button.update(interactive=False) - - -def main(): - with gr.Blocks(theme="freddyaboulton/dracula_revamped") as demo: - gr.Markdown("## ๐Ÿค— AutoTrain Advanced") - user_token = os.environ.get("HF_TOKEN", "") + return {"error": "Task not supported yet"} - if len(user_token) == 0: - user_token = get_user_token() - - if user_token is None: - gr.Markdown( - """Please login with a write [token](https://huggingface.co/settings/tokens). - Pass your HF token in an environment variable called `HF_TOKEN` and then restart this app. - """ - ) - return demo - - user_token, valid_can_pay, who_is_training = _login_user(user_token) - - if user_token is None or len(user_token) == 0: - gr.Error("Please login with a write token.") - - user_token = gr.Textbox( - value=user_token, type="password", lines=1, max_lines=1, visible=False, interactive=False - ) - valid_can_pay = gr.Textbox(value=",".join(valid_can_pay), visible=False, interactive=False) - with gr.Row(): - with gr.Column(): - with gr.Row(): - autotrain_username = gr.Dropdown( - label="AutoTrain Username", - choices=who_is_training, - value=who_is_training[0] if who_is_training else "", - ) - autotrain_backend = gr.Dropdown( - label="AutoTrain Backend", - choices=["HuggingFace Internal", "HuggingFace Spaces"], - value="HuggingFace Internal", - interactive=True, - ) - with gr.Row(): - project_name = gr.Textbox(label="Project name", value="", lines=1, max_lines=1, interactive=True) - project_type = gr.Dropdown( - label="Project Type", choices=list(APP_TASKS.keys()), value=list(APP_TASKS.keys())[0] - ) - task_type = gr.Dropdown( - label="Task", - choices=APP_TASKS[list(APP_TASKS.keys())[0]], - value=APP_TASKS[list(APP_TASKS.keys())[0]][0], - interactive=True, - ) - model_choice = gr.Dropdown( - label="Model Choice", - choices=["AutoTrain", "HuggingFace Hub"], - value="AutoTrain", - visible=True, - interactive=True, - ) - hub_model = gr.Dropdown( - label="Hub Model", - value="", - visible=False, - interactive=True, - elem_id="hub_model", - ) - gr.Markdown("
") - with gr.Row(): - with gr.Column(): - with gr.Tabs(elem_id="tabs"): - with gr.TabItem("Data"): - with gr.Column(): - # file_type_training = gr.Radio( - # label="File Type", - # choices=["CSV", "JSONL"], - # value="CSV", - # visible=True, - # interactive=True, - # ) - training_data = gr.File( - label="Training Data", - file_types=ALLOWED_FILE_TYPES, - file_count="multiple", - visible=True, - interactive=True, - elem_id="training_data_box", - ) - with gr.Accordion("Validation Data (Optional)", open=False): - validation_data = gr.File( - label="Validation Data (Optional)", - file_types=ALLOWED_FILE_TYPES, - file_count="multiple", - visible=True, - interactive=True, - elem_id="validation_data_box", - ) - with gr.Row(): - col_map_text = gr.Dropdown( - label="Text Column", choices=[], visible=False, interactive=True - ) - col_map_target = gr.Dropdown( - label="Target Column", choices=[], visible=False, interactive=True - ) - concept_token = gr.Text( - value="", visible=False, interactive=True, lines=1, max_lines=1 - ) - with gr.TabItem("Params"): - with gr.Row(): - source_language = gr.Dropdown( - label="Source Language", - choices=SUPPORTED_LANGUAGES[:-1], - value="en", - visible=True, - interactive=True, - elem_id="source_language", - ) - num_models = gr.Slider( - label="Number of Models", - minimum=1, - maximum=25, - value=5, - step=1, - visible=True, - interactive=True, - elem_id="num_models", - ) - target_language = gr.Dropdown( - label="Target Language", - choices=["fr"], - value="fr", - visible=False, - interactive=True, - elem_id="target_language", - ) - image_size = gr.Number( - label="Image Size", - value=512, - visible=False, - interactive=True, - elem_id="image_size", - ) - - with gr.Row(): - learning_rate = gr.Number( - label="Learning Rate", - value=5e-5, - visible=False, - interactive=True, - elem_id="learning_rate", - ) - batch_size = gr.Number( - label="Train Batch Size", - value=32, - visible=False, - interactive=True, - elem_id="train_batch_size", - ) - num_epochs = gr.Number( - label="Number of Epochs", - value=3, - visible=False, - interactive=True, - elem_id="num_train_epochs", - ) - with gr.Row(): - gradient_accumulation_steps = gr.Number( - label="Gradient Accumulation Steps", - value=1, - visible=False, - interactive=True, - elem_id="gradient_accumulation_steps", - ) - percentage_warmup_steps = gr.Number( - label="Percentage of Warmup Steps", - value=0.1, - visible=False, - interactive=True, - elem_id="percentage_warmup", - ) - weight_decay = gr.Number( - label="Weight Decay", - value=0.01, - visible=False, - interactive=True, - elem_id="weight_decay", - ) - with gr.Row(): - lora_r = gr.Number( - label="LoraR", - value=16, - visible=False, - interactive=True, - elem_id="lora_r", - ) - lora_alpha = gr.Number( - label="LoraAlpha", - value=32, - visible=False, - interactive=True, - elem_id="lora_alpha", - ) - lora_dropout = gr.Number( - label="Lora Dropout", - value=0.1, - visible=False, - interactive=True, - elem_id="lora_dropout", - ) - with gr.Row(): - db_num_steps = gr.Number( - label="Num Steps", - value=500, - visible=False, - interactive=True, - elem_id="num_steps", - ) - with gr.Row(): - optimizer = gr.Dropdown( - label="Optimizer", - choices=["adamw_torch", "adamw_hf", "sgd", "adafactor", "adagrad"], - value="adamw_torch", - visible=False, - interactive=True, - elem_id="optimizer", - ) - scheduler = gr.Dropdown( - label="Scheduler", - choices=["linear", "cosine"], - value="linear", - visible=False, - interactive=True, - elem_id="scheduler", - ) - - 
add_job_button = gr.Button( - value="Add Job", - visible=True, - interactive=True, - elem_id="add_job", - ) - # clear_jobs_button = gr.Button( - # value="Clear Jobs", - # visible=True, - # interactive=True, - # elem_id="clear_jobs", - # ) - gr.Markdown("
") - estimated_costs_md = gr.Markdown(value="Estimated Costs: N/A", visible=True, interactive=False) - estimated_costs_num = gr.Number(value=0, visible=False, interactive=False) - create_project_button = gr.Button( - value="Create Project", - visible=True, - interactive=True, - elem_id="create_project", - ) - with gr.Column(): - param_choice = gr.Dropdown( - label="Param Choice", - choices=["AutoTrain"], - value="AutoTrain", - visible=True, - interactive=True, - ) - training_params_txt = gr.Text(value="[]", visible=False, interactive=False) - training_params_md = gr.DataFrame(visible=False, interactive=False) - - final_output = gr.Markdown(value="", visible=True, interactive=False) - hyperparameters = [ - hub_model, - num_models, - source_language, - target_language, - learning_rate, - batch_size, - num_epochs, - gradient_accumulation_steps, - lora_r, - lora_alpha, - lora_dropout, - optimizer, - scheduler, - percentage_warmup_steps, - weight_decay, - db_num_steps, - image_size, - ] - - def _update_params(params_data): - _task = params_data[task_type] - _task = APP_TASKS_MAPPING[_task] - params = Params( - task=_task, - param_choice="autotrain" if params_data[param_choice] == "AutoTrain" else "manual", - model_choice="autotrain" if params_data[model_choice] == "AutoTrain" else "hub_model", - ) - params = params.get() - visible_params = [] - for param in hyperparameters: - if param.elem_id in params.keys(): - visible_params.append(param.elem_id) - op = [h.update(visible=h.elem_id in visible_params) for h in hyperparameters] - op.append(add_job_button.update(visible=True)) - op.append(training_params_md.update(visible=False)) - op.append(training_params_txt.update(value="[]")) - return op - - autotrain_backend.change( - _project_type_update, - inputs=[project_type, task_type, autotrain_backend], - outputs=[task_type, model_choice, param_choice, hub_model], - ) - - project_type.change( - _project_type_update, - inputs=[project_type, task_type, autotrain_backend], - outputs=[task_type, model_choice, param_choice, hub_model], - ) - task_type.change( - _task_type_update, - inputs=[task_type, autotrain_backend], - outputs=[model_choice, param_choice, hub_model], - ) - model_choice.change( - _update_param_choice, - inputs=[model_choice, autotrain_backend], - outputs=param_choice, - ).then( - _update_hub_model_choices, - inputs=[task_type, model_choice], - outputs=hub_model, - ) - - param_choice.change( - _update_params, - inputs=set([task_type, param_choice, model_choice] + hyperparameters + [add_job_button]), - outputs=hyperparameters + [add_job_button, training_params_md, training_params_txt], - ) - task_type.change( - _update_params, - inputs=set([task_type, param_choice, model_choice] + hyperparameters + [add_job_button]), - outputs=hyperparameters + [add_job_button, training_params_md, training_params_txt], - ) - model_choice.change( - _update_params, - inputs=set([task_type, param_choice, model_choice] + hyperparameters + [add_job_button]), - outputs=hyperparameters + [add_job_button, training_params_md, training_params_txt], - ) - - def _add_job(params_data): - _task = params_data[task_type] - _task = APP_TASKS_MAPPING[_task] - _param_choice = "autotrain" if params_data[param_choice] == "AutoTrain" else "manual" - _model_choice = "autotrain" if params_data[model_choice] == "AutoTrain" else "hub_model" - if _model_choice == "hub_model" and params_data[hub_model] is None: - logger.error("Hub model is None") - return - _training_params = {} - params = Params(task=_task, 
param_choice=_param_choice, model_choice=_model_choice) - params = params.get() - for _param in hyperparameters: - if _param.elem_id in params.keys(): - _training_params[_param.elem_id] = params_data[_param] - _training_params_md = json.loads(params_data[training_params_txt]) - if _param_choice == "autotrain": - if len(_training_params_md) > 0: - _training_params_md[0] = _training_params - _training_params_md = _training_params_md[:1] - else: - _training_params_md.append(_training_params) - else: - _training_params_md.append(_training_params) - params_df = pd.DataFrame(_training_params_md) - # remove hub_model column - if "hub_model" in params_df.columns: - params_df = params_df.drop(columns=["hub_model"]) - return [ - gr.DataFrame.update(value=params_df, visible=True), - gr.Textbox.update(value=json.dumps(_training_params_md), visible=False), - ] - - add_job_button.click( - _add_job, - inputs=set( - [task_type, param_choice, model_choice] + hyperparameters + [training_params_md, training_params_txt] - ), - outputs=[training_params_md, training_params_txt], - ) - col_map_components = [ - col_map_text, - col_map_target, - concept_token, - ] - training_data.change( - _update_col_map, - inputs=[training_data, task_type], - outputs=col_map_components, - ) - task_type.change( - _update_col_map, - inputs=[training_data, task_type], - outputs=col_map_components, - ) - estimate_costs_inputs = [ - training_data, - validation_data, - task_type, - user_token, - autotrain_username, - training_params_txt, - autotrain_backend, - ] - estimate_costs_outputs = [estimated_costs_md, estimated_costs_num] - training_data.change(_estimate_costs, inputs=estimate_costs_inputs, outputs=estimate_costs_outputs) - validation_data.change(_estimate_costs, inputs=estimate_costs_inputs, outputs=estimate_costs_outputs) - training_params_txt.change(_estimate_costs, inputs=estimate_costs_inputs, outputs=estimate_costs_outputs) - task_type.change(_estimate_costs, inputs=estimate_costs_inputs, outputs=estimate_costs_outputs) - add_job_button.click(_estimate_costs, inputs=estimate_costs_inputs, outputs=estimate_costs_outputs) - - create_project_button.click(disable_create_project_button, None, create_project_button).then( - _create_project, - inputs=[ - autotrain_username, - valid_can_pay, - project_name, - user_token, - task_type, - training_data, - validation_data, - col_map_text, - col_map_target, - concept_token, - training_params_txt, - hub_model, - estimated_costs_num, - autotrain_backend, - ], - outputs=final_output, - ) - - demo.load( - _update_project_name, - outputs=[project_name, create_project_button], - ) + params["model_choice"] = base_model + params["param_choice"] = "manual" + params["backend"] = hardware - return demo + jobs_df = pd.DataFrame([params]) + project = AutoTrainProject(dataset=dset, job_params=jobs_df) + ids = project.create() + return {"success": "true", "space_ids": ids} diff --git a/src/autotrain/backend.py b/src/autotrain/backend.py index 4e1d49d6bd..58a49ab188 100644 --- a/src/autotrain/backend.py +++ b/src/autotrain/backend.py @@ -75,13 +75,18 @@ def _llm_munge_data(params, username): else: valid_data_path = None if os.path.exists(train_data_path): + col_map = {"text": params.text_column} + if params.rejected_text_column is not None: + col_map["rejected_text"] = params.rejected_text_column + if params.prompt_column is not None: + col_map["prompt"] = params.prompt_column dset = AutoTrainDataset( train_data=[train_data_path], task="lm_training", token=params.token, 
project_name=params.project_name, username=username, - column_mapping={"text": params.text_column}, + column_mapping=col_map, valid_data=[valid_data_path] if valid_data_path is not None else None, percent_valid=None, # TODO: add to UI ) diff --git a/src/autotrain/dataset.py b/src/autotrain/dataset.py index 20db2a69eb..a3f99e2d0c 100644 --- a/src/autotrain/dataset.py +++ b/src/autotrain/dataset.py @@ -7,7 +7,7 @@ import pandas as pd from autotrain import logger -from autotrain.preprocessor.dreambooth import DreamboothPreprocessor +from autotrain.preprocessor.dreambooth import DreamboothPreprocessor, DreamboothPreprocessorV2 from autotrain.preprocessor.tabular import ( TabularBinaryClassificationPreprocessor, TabularMultiClassClassificationPreprocessor, @@ -53,6 +53,7 @@ class AutoTrainDreamboothDataset: token: str project_name: str username: str + use_v2: bool = False def __str__(self) -> str: info = f"Dataset: {self.project_name} ({self.task})\n" @@ -67,13 +68,22 @@ def num_samples(self): return len(self.concept_images) def prepare(self): - preprocessor = DreamboothPreprocessor( - concept_images=self.concept_images, - concept_name=self.concept_name, - token=self.token, - project_name=self.project_name, - username=self.username, - ) + if self.use_v2: + preprocessor = DreamboothPreprocessorV2( + concept_images=self.concept_images, + concept_name=self.concept_name, + token=self.token, + project_name=self.project_name, + username=self.username, + ) + else: + preprocessor = DreamboothPreprocessor( + concept_images=self.concept_images, + concept_name=self.concept_name, + token=self.token, + project_name=self.project_name, + username=self.username, + ) preprocessor.prepare() @@ -286,22 +296,14 @@ def prepare(self): preprocessor.prepare() elif self.task == "lm_training": - text_column = self.column_mapping.get("text", None) - if text_column is None: - prompt_column = self.column_mapping["prompt"] - response_column = self.column_mapping["response"] - else: - prompt_column = None - response_column = None - context_column = self.column_mapping.get("context", None) - prompt_start_column = self.column_mapping.get("prompt_start", None) + text_column = self.column_mapping["text"] + prompt_column = self.column_mapping.get("prompt") + rejected_text_column = self.column_mapping.get("rejected_text") preprocessor = LLMPreprocessor( train_data=self.train_df, text_column=text_column, prompt_column=prompt_column, - response_column=response_column, - context_column=context_column, - prompt_start_column=prompt_start_column, + rejected_text_column=rejected_text_column, username=self.username, project_name=self.project_name, valid_data=self.valid_df, diff --git a/src/autotrain/preprocessor/dreambooth.py b/src/autotrain/preprocessor/dreambooth.py index 72ebd5778c..f65a9d46d3 100644 --- a/src/autotrain/preprocessor/dreambooth.py +++ b/src/autotrain/preprocessor/dreambooth.py @@ -8,6 +8,63 @@ from autotrain import logger +@dataclass +class DreamboothPreprocessorV2: + concept_images: List[Any] + concept_name: str + username: str + project_name: str + token: str + + def __post_init__(self): + self.repo_name = f"{self.username}/autotrain-data-{self.project_name}" + try: + create_repo( + repo_id=self.repo_name, + repo_type="dataset", + token=self.token, + private=True, + exist_ok=False, + ) + except Exception: + logger.error("Error creating repo") + raise ValueError("Error creating repo") + + def _upload_concept_images(self, file, api): + logger.info(f"Uploading {file} to concept1") + api.upload_file( + 
path_or_fileobj=file.file.read(), + path_in_repo=f"concept1/{file.filename.split('/')[-1]}", + repo_id=self.repo_name, + repo_type="dataset", + token=self.token, + ) + + def _upload_concept_prompts(self, api): + _prompts = {} + _prompts["concept1"] = self.concept_name + + prompts = json.dumps(_prompts) + prompts = prompts.encode("utf-8") + prompts = io.BytesIO(prompts) + api.upload_file( + path_or_fileobj=prompts, + path_in_repo="prompts.json", + repo_id=self.repo_name, + repo_type="dataset", + token=self.token, + ) + + def prepare(self): + api = HfApi(token=self.token) + for _file in self.concept_images: + print(_file) + print(type(_file)) + self._upload_concept_images(_file, api) + + self._upload_concept_prompts(api) + + @dataclass class DreamboothPreprocessor: concept_images: List[Any] diff --git a/src/autotrain/preprocessor/tabular.py b/src/autotrain/preprocessor/tabular.py index 928df25d46..178962af94 100644 --- a/src/autotrain/preprocessor/tabular.py +++ b/src/autotrain/preprocessor/tabular.py @@ -121,6 +121,7 @@ class TabularMultiLabelClassificationPreprocessor: valid_data: Optional[pd.DataFrame] = None test_size: Optional[float] = 0.2 seed: Optional[int] = 42 + token: Optional[str] = None def __post_init__(self): # check if id_column and label_column are in train_data diff --git a/src/autotrain/preprocessor/text.py b/src/autotrain/preprocessor/text.py index b2cc995ac1..aaad9bf9b3 100644 --- a/src/autotrain/preprocessor/text.py +++ b/src/autotrain/preprocessor/text.py @@ -10,7 +10,7 @@ LLM_RESERVED_COLUMNS = [ "autotrain_prompt", "autotrain_context", - "autotrain_response", + "autotrain_rejected_text", "autotrain_prompt_start", ] @@ -130,33 +130,31 @@ class LLMPreprocessor: valid_data: Optional[pd.DataFrame] = None test_size: Optional[float] = 0.2 seed: Optional[int] = 42 - context_column: Optional[str] = None - prompt_start_column: Optional[str] = None text_column: Optional[str] = None prompt_column: Optional[str] = None - response_column: Optional[str] = None + rejected_text_column: Optional[str] = None def __post_init__(self): - # user can either provide text_column or prompt_column and response_column - if self.text_column is not None and (self.prompt_column is not None or self.response_column is not None): - raise ValueError("Please provide either text_column or prompt_column and response_column") + # user can either provide text_column or prompt_column and rejected_text_column + if self.text_column is not None and (self.prompt_column is not None or self.rejected_text_column is not None): + raise ValueError("Please provide either text_column or prompt_column and rejected_text_column") if self.text_column is not None: - # if text_column is provided, use it for prompt_column and response_column + # if text_column is provided, use it for prompt_column and rejected_text_column self.prompt_column = self.text_column - self.response_column = self.text_column + self.rejected_text_column = self.text_column - # check if text_column and response_column are in train_data + # check if text_column and rejected_text_column are in train_data if self.prompt_column not in self.train_data.columns: raise ValueError(f"{self.prompt_column} not in train data") - if self.response_column not in self.train_data.columns: - raise ValueError(f"{self.response_column} not in train data") - # check if text_column and response_column are in valid_data + if self.rejected_text_column not in self.train_data.columns: + raise ValueError(f"{self.rejected_text_column} not in train data") + # check if text_column and 
rejected_text_column are in valid_data if self.valid_data is not None: if self.prompt_column not in self.valid_data.columns: raise ValueError(f"{self.prompt_column} not in valid data") - if self.response_column not in self.valid_data.columns: - raise ValueError(f"{self.response_column} not in valid data") + if self.rejected_text_column not in self.valid_data.columns: + raise ValueError(f"{self.rejected_text_column} not in valid data") # make sure no reserved columns are in train_data or valid_data for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS: @@ -194,25 +192,11 @@ def prepare_columns(self, train_df, valid_df): train_df.loc[:, "autotrain_prompt"] = train_df[self.prompt_column] valid_df.loc[:, "autotrain_prompt"] = valid_df[self.prompt_column] - train_df.loc[:, "autotrain_response"] = train_df[self.response_column] - valid_df.loc[:, "autotrain_response"] = valid_df[self.response_column] + train_df.loc[:, "autotrain_rejected_text"] = train_df[self.rejected_text_column] + valid_df.loc[:, "autotrain_rejected_text"] = valid_df[self.rejected_text_column] - train_df = train_df.drop(columns=[self.prompt_column, self.response_column]) - valid_df = valid_df.drop(columns=[self.prompt_column, self.response_column]) - - if self.context_column is not None: - train_df.loc[:, "autotrain_context"] = train_df[self.context_column] - valid_df.loc[:, "autotrain_context"] = valid_df[self.context_column] - - train_df = train_df.drop(columns=[self.context_column]) - valid_df = valid_df.drop(columns=[self.context_column]) - - if self.prompt_start_column is not None: - train_df.loc[:, "autotrain_prompt_start"] = train_df[self.prompt_start_column] - valid_df.loc[:, "autotrain_prompt_start"] = valid_df[self.prompt_start_column] - - train_df = train_df.drop(columns=[self.prompt_start_column]) - valid_df = valid_df.drop(columns=[self.prompt_start_column]) + train_df = train_df.drop(columns=[self.prompt_column, self.rejected_text_column]) + valid_df = valid_df.drop(columns=[self.prompt_column, self.rejected_text_column]) return train_df, valid_df diff --git a/src/autotrain/project.py b/src/autotrain/project.py index efa6745d4b..82a7c64de8 100644 --- a/src/autotrain/project.py +++ b/src/autotrain/project.py @@ -3,24 +3,19 @@ """ import json -import os -import time from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Union import pandas as pd -from codecarbon import EmissionsTracker from autotrain import logger from autotrain.backend import SpaceRunner from autotrain.dataset import AutoTrainDataset, AutoTrainDreamboothDataset, AutoTrainImageClassificationDataset -from autotrain.languages import SUPPORTED_LANGUAGES from autotrain.tasks import TASKS from autotrain.trainers.clm.params import LLMTrainingParams from autotrain.trainers.dreambooth.params import DreamBoothTrainingParams from autotrain.trainers.tabular.params import TabularParams from autotrain.trainers.text_classification.params import TextClassificationParams -from autotrain.utils import http_get, http_post @dataclass @@ -180,191 +175,3 @@ def create(self): raise NotImplementedError if self.backend in self.spaces_backends: return self.create_spaces() - - -@dataclass -class Project: - dataset: Union[AutoTrainDataset, AutoTrainDreamboothDataset, AutoTrainImageClassificationDataset] - param_choice: Optional[str] = "autotrain" - hub_model: Optional[str] = None - job_params: Optional[List[Dict[str, str]]] = None - - def __post_init__(self): - self.token = self.dataset.token - self.name = 
self.dataset.project_name - self.username = self.dataset.username - self.task = self.dataset.task - - self.param_choice = self.param_choice.lower() - - if self.hub_model is not None: - if len(self.hub_model) == 0: - self.hub_model = None - - if self.job_params is None: - self.job_params = [] - - logger.info(f"๐Ÿš€๐Ÿš€๐Ÿš€ Creating project {self.name}, task: {self.task}") - logger.info(f"๐Ÿš€ Using username: {self.username}") - logger.info(f"๐Ÿš€ Using param_choice: {self.param_choice}") - logger.info(f"๐Ÿš€ Using hub_model: {self.hub_model}") - logger.info(f"๐Ÿš€ Using job_params: {self.job_params}") - - if self.token is None: - raise ValueError("โŒ Please login using `huggingface-cli login`") - - if self.hub_model is not None and len(self.job_params) == 0: - raise ValueError("โŒ Job parameters are required when hub model is specified.") - - if self.hub_model is None and len(self.job_params) > 1: - raise ValueError("โŒ Only one job parameter is allowed in AutoTrain mode.") - - if self.param_choice == "autotrain": - if "source_language" in self.job_params[0] and "target_language" not in self.job_params[0]: - self.language = self.job_params[0]["source_language"] - # remove source language from job params - self.job_params[0].pop("source_language") - elif "source_language" in self.job_params[0] and "target_language" in self.job_params[0]: - self.language = f'{self.job_params[0]["target_language"]}2{self.job_params[0]["source_language"]}' - # remove source and target language from job params - self.job_params[0].pop("source_language") - self.job_params[0].pop("target_language") - else: - self.language = "unk" - - if "num_models" in self.job_params[0]: - self.max_models = self.job_params[0]["num_models"] - self.job_params[0].pop("num_models") - elif "num_models" not in self.job_params[0] and "source_language" in self.job_params[0]: - raise ValueError("โŒ Please specify num_models in job_params when using AutoTrain model") - else: - self.language = "unk" - self.max_models = len(self.job_params) - - def create_local(self, payload): - from autotrain.trainers.dreambooth import train_ui as train_dreambooth - from autotrain.trainers.image_classification import train as train_image_classification - from autotrain.trainers.lm_trainer import train as train_lm - from autotrain.trainers.text_classification import train as train_text_classification - - # check if training tracker file exists in /tmp/ - if os.path.exists(os.path.join("/tmp", "training")): - raise ValueError("โŒ Another training job is already running in this workspace.") - - if len(payload["config"]["params"]) > 1: - raise ValueError("โŒ Only one job parameter is allowed in spaces/local mode.") - - model_path = os.path.join("/tmp/model", payload["proj_name"]) - os.makedirs(model_path, exist_ok=True) - - co2_tracker = EmissionsTracker(save_to_file=False) - co2_tracker.start() - # create a training tracker file in /tmp/, using touch - with open(os.path.join("/tmp", "training"), "w") as f: - f.write("training") - - if payload["task"] in [1, 2]: - _ = train_text_classification( - co2_tracker=co2_tracker, - payload=payload, - huggingface_token=self.token, - model_path=model_path, - ) - elif payload["task"] in [17, 18]: - _ = train_image_classification( - co2_tracker=co2_tracker, - payload=payload, - huggingface_token=self.token, - model_path=model_path, - ) - elif payload["task"] == 25: - _ = train_dreambooth( - co2_tracker=co2_tracker, - payload=payload, - huggingface_token=self.token, - model_path=model_path, - ) - elif payload["task"] 
== 9: - _ = train_lm( - co2_tracker=co2_tracker, - payload=payload, - huggingface_token=self.token, - model_path=model_path, - ) - else: - raise NotImplementedError - - # remove the training tracker file in /tmp/, using rm - os.remove(os.path.join("/tmp", "training")) - - def create(self, local=False): - """Create a project and return it""" - logger.info(f"๐Ÿš€ Creating project {self.name}, task: {self.task}") - task_id = TASKS.get(self.task) - if task_id is None: - raise ValueError(f"โŒ Invalid task selected. Please choose one of {TASKS.keys()}") - language = str(self.language).strip().lower() - if task_id is None: - raise ValueError(f"โŒ Invalid task specified. Please choose one of {list(TASKS.keys())}") - - if self.hub_model is not None: - language = "unk" - - if language not in SUPPORTED_LANGUAGES: - raise ValueError("โŒ Invalid language. Please check supported languages in AutoTrain documentation.") - - payload = { - "username": self.username, - "proj_name": self.name, - "task": task_id, - "config": { - "advanced": True, - "autotrain": True if self.param_choice == "autotrain" else False, - "language": language, - "max_models": self.max_models, - "hub_model": self.hub_model, - "params": self.job_params, - }, - } - logger.info(f"๐Ÿš€ Creating project with payload: {payload}") - - if local is True: - return self.create_local(payload=payload) - - logger.info(f"๐Ÿš€ Creating project with payload: {payload}") - json_resp = http_post(path="/projects/create", payload=payload, token=self.token).json() - proj_name = json_resp["proj_name"] - proj_id = json_resp["id"] - created = json_resp["created"] - - if created is True: - return proj_id - raise ValueError(f"โŒ Project with name {proj_name} already exists.") - - def approve(self, project_id): - # Process data - _ = http_post( - path=f"/projects/{project_id}/data/start_processing", - token=self.token, - ).json() - - logger.info("โณ Waiting for data processing to complete ...") - is_data_processing_success = False - while is_data_processing_success is not True: - project_status = http_get( - path=f"/projects/{project_id}", - token=self.token, - ).json() - # See database.database.enums.ProjectStatus for definitions of `status` - if project_status["status"] == 3: - is_data_processing_success = True - logger.info("โœ… Data processing complete!") - - time.sleep(3) - - logger.info(f"๐Ÿš€ Approving project # {project_id}") - # Approve training job - _ = http_post( - path=f"/projects/{project_id}/start_training", - token=self.token, - ).json() diff --git a/src/autotrain/trainers/clm/__main__.py b/src/autotrain/trainers/clm/__main__.py index 1810e26a39..9aedaaddd6 100644 --- a/src/autotrain/trainers/clm/__main__.py +++ b/src/autotrain/trainers/clm/__main__.py @@ -69,6 +69,7 @@ def train(config): if config.trainer == "dpo": if not (config.prompt_text_column == "prompt" and config.prompt_text_column in train_data.column_names): train_data = train_data.rename_column(config.prompt_text_column, "prompt") + if config.valid_split is not None: valid_path = f"{config.data_path}/{config.valid_split}.csv" if os.path.exists(valid_path): diff --git a/src/autotrain/trainers/clm/params.py b/src/autotrain/trainers/clm/params.py index da75e439f3..83914b66ea 100644 --- a/src/autotrain/trainers/clm/params.py +++ b/src/autotrain/trainers/clm/params.py @@ -1,12 +1,11 @@ -import os from typing import List, Union -from pydantic import BaseModel, Field +from pydantic import Field -from autotrain import logger +from autotrain.trainers.common import AutoTrainParams -class 
LLMTrainingParams(BaseModel): +class LLMTrainingParams(AutoTrainParams): model: str = Field("gpt2", title="Model name") data_path: str = Field("data", title="Data path") project_name: str = Field("Project Name", title="Output directory") @@ -52,31 +51,3 @@ class LLMTrainingParams(BaseModel): model_ref: str = Field(None, title="Reference, for DPO trainer") dpo_beta: float = Field(0.1, title="Beta for DPO trainer") prompt_text_column: str = Field(None, title="Prompt text column") - - def save(self, output_dir): - os.makedirs(output_dir, exist_ok=True) - path = os.path.join(output_dir, "training_params.json") - # save formatted json - with open(path, "w") as f: - f.write(self.json(indent=4)) - - def __str__(self): - data = self.dict() - data["token"] = "*****" if data.get("token") else None - return str(data) - - def __init__(self, **data): - super().__init__(**data) - - # Parameters not supplied by the user - defaults = {f.name for f in self.__fields__.values() if f.default == self.__dict__[f.name]} - supplied = set(data.keys()) - not_supplied = defaults - supplied - if not_supplied: - logger.warning(f"Parameters not supplied by user and set to default: {', '.join(not_supplied)}") - - # Parameters that were supplied but not used - # This is a naive implementation. It might catch some internal Pydantic params. - unused = supplied - set(self.__fields__) - if unused: - logger.warning(f"Parameters supplied but not used: {', '.join(unused)}") diff --git a/src/autotrain/trainers/clm/utils.py b/src/autotrain/trainers/clm/utils.py index 9f7bbdc7bf..c07379c641 100644 --- a/src/autotrain/trainers/clm/utils.py +++ b/src/autotrain/trainers/clm/utils.py @@ -54,6 +54,8 @@ def preprocess_reward(examples, tokenizer): def get_target_modules(config): if config.target_modules is None: return TARGET_MODULES.get(config.model) + elif config.target_modules.strip() == "": + return TARGET_MODULES.get(config.model) return config.target_modules.split(",") diff --git a/src/autotrain/trainers/common.py b/src/autotrain/trainers/common.py index d641e300d6..251608bc3d 100644 --- a/src/autotrain/trainers/common.py +++ b/src/autotrain/trainers/common.py @@ -1,3 +1,6 @@ +""" +Common classes and functions for all trainers. +""" import os from pydantic import BaseModel @@ -6,23 +9,39 @@ class AutoTrainParams(BaseModel): + """ + Base class for all AutoTrain parameters. + """ + + class Config: + protected_namespaces = () + def save(self, output_dir): + """ + Save parameters to a json file. + """ os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, "training_params.json") # save formatted json - with open(path, "w") as f: - f.write(self.json(indent=4)) + with open(path, "w", encoding="utf-8") as f: + f.write(self.model_dump_json(indent=4)) def __str__(self): - data = self.dict() + """ + String representation of the parameters. + """ + data = self.model_dump() data["token"] = "*****" if data.get("token") else None return str(data) def __init__(self, **data): + """ + Initialize the parameters, check for unused/extra parameters and warn the user. + """ super().__init__(**data) # Parameters not supplied by the user - defaults = {f.name for f in self.__fields__.values() if f.default == self.__dict__[f.name]} + defaults = set(self.model_fields.keys()) supplied = set(data.keys()) not_supplied = defaults - supplied if not_supplied: @@ -30,6 +49,6 @@ def __init__(self, **data): # Parameters that were supplied but not used # This is a naive implementation. It might catch some internal Pydantic params. 
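# Illustrative sketch, not part of the diff: Pydantic v2 renamed the class-level field
# registry from Model.__fields__ to Model.model_fields, which is what the surrounding
# common.py hunk switches to. With a hypothetical TinyParams model, the supplied/unused
# bookkeeping in AutoTrainParams.__init__ works roughly like this:
from pydantic import BaseModel


class TinyParams(BaseModel):
    lr: float = 5e-5
    epochs: int = 3


supplied = {"lr": 1e-4, "mystery_flag": True}
declared = set(TinyParams.model_fields)      # {"lr", "epochs"}
not_supplied = declared - set(supplied)      # fields that fall back to their defaults
unused = set(supplied) - declared            # keys the model does not declare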
-        unused = supplied - set(self.__fields__)
+        unused = supplied - set(self.model_fields)
         if unused:
             logger.warning(f"Parameters supplied but not used: {', '.join(unused)}")
diff --git a/src/autotrain/trainers/image_classification/__main__.py b/src/autotrain/trainers/image_classification/__main__.py
index 569929595c..48b66123df 100644
--- a/src/autotrain/trainers/image_classification/__main__.py
+++ b/src/autotrain/trainers/image_classification/__main__.py
@@ -62,14 +62,14 @@ def train(config):
             f"Number of classes in train and valid are not the same. Training has {num_classes} and valid has {num_classes_valid}"
         )
 
-    model_config = AutoConfig.from_pretrained(config.model_name, num_labels=num_classes)
+    model_config = AutoConfig.from_pretrained(config.model, num_labels=num_classes)
     model_config._num_labels = len(label2id)
     model_config.label2id = label2id
     model_config.id2label = {v: k for k, v in label2id.items()}
 
     try:
         model = AutoModelForImageClassification.from_pretrained(
-            config.model_name,
+            config.model,
             config=model_config,
             trust_remote_code=True,
             token=config.token,
@@ -77,7 +77,7 @@ def train(config):
         )
     except OSError:
         model = AutoModelForImageClassification.from_pretrained(
-            config.model_name,
+            config.model,
             config=model_config,
             from_tf=True,
             trust_remote_code=True,
@@ -85,7 +85,7 @@ def train(config):
             ignore_mismatched_sizes=True,
         )
 
-    image_processor = AutoImageProcessor.from_pretrained(config.model_name, token=config.token)
+    image_processor = AutoImageProcessor.from_pretrained(config.model, token=config.token)
     train_data, valid_data = utils.process_data(train_data, valid_data, image_processor, config)
 
     if config.logging_steps == -1:
diff --git a/src/autotrain/trainers/image_classification/params.py b/src/autotrain/trainers/image_classification/params.py
index 19b77e4635..70c336dc91 100644
--- a/src/autotrain/trainers/image_classification/params.py
+++ b/src/autotrain/trainers/image_classification/params.py
@@ -1,11 +1,11 @@
-import os
+from pydantic import Field
 
-from pydantic import BaseModel, Field
+from autotrain.trainers.common import AutoTrainParams
 
 
-class ImageClassificationParams(BaseModel):
+class ImageClassificationParams(AutoTrainParams):
     data_path: str = Field(None, title="Data path")
-    model_name: str = Field("bert-base-uncased", title="Model name")
+    model: str = Field("google/vit-base-patch16-224", title="Model name")
     lr: float = Field(5e-5, title="Learning rate")
     epochs: int = Field(3, title="Number of training epochs")
     batch_size: int = Field(8, title="Training batch size")
@@ -31,15 +31,3 @@ class ImageClassificationParams(BaseModel):
     image_column: str = Field("image", title="Image column")
     target_column: str = Field("target", title="Target column")
     log: str = Field("none", title="Logging using experiment tracking")
-
-    def __str__(self):
-        data = self.dict()
-        data["token"] = "*****" if data.get("token") else None
-        return str(data)
-
-    def save(self, output_dir):
-        os.makedirs(output_dir, exist_ok=True)
-        path = os.path.join(output_dir, "training_params.json")
-        # save formatted json
-        with open(path, "w") as f:
-            f.write(self.json(indent=4))
diff --git a/src/autotrain/trainers/text_classification/params.py b/src/autotrain/trainers/text_classification/params.py
index 83b26e150b..9683766cc6 100644
--- a/src/autotrain/trainers/text_classification/params.py
+++ b/src/autotrain/trainers/text_classification/params.py
@@ -1,11 +1,9 @@
-import os
+from pydantic import Field
 
-from pydantic import BaseModel, Field
+from autotrain.trainers.common import AutoTrainParams
 
-from autotrain import logger
-
 
-class TextClassificationParams(BaseModel):
+class TextClassificationParams(AutoTrainParams):
     data_path: str = Field(None, title="Data path")
     model: str = Field("bert-base-uncased", title="Model name")
     lr: float = Field(5e-5, title="Learning rate")
@@ -35,31 +33,3 @@ class TextClassificationParams(BaseModel):
     evaluation_strategy: str = Field("epoch", title="Evaluation strategy")
     username: str = Field(None, title="Hugging Face Username")
     log: str = Field("none", title="Logging using experiment tracking")
-
-    def __str__(self):
-        data = self.dict()
-        data["token"] = "*****" if data.get("token") else None
-        return str(data)
-
-    def save(self, output_dir):
-        os.makedirs(output_dir, exist_ok=True)
-        path = os.path.join(output_dir, "training_params.json")
-        # save formatted json
-        with open(path, "w") as f:
-            f.write(self.json(indent=4))
-
-    def __init__(self, **data):
-        super().__init__(**data)
-
-        # Parameters not supplied by the user
-        defaults = {f.name for f in self.__fields__.values() if f.default == self.__dict__[f.name]}
-        supplied = set(data.keys())
-        not_supplied = defaults - supplied
-        if not_supplied:
-            logger.warning(f"Parameters not supplied by user and set to default: {', '.join(not_supplied)}")
-
-        # Parameters that were supplied but not used
-        # This is a naive implementation. It might catch some internal Pydantic params.
-        unused = supplied - set(self.__fields__)
-        if unused:
-            logger.warning(f"Parameters supplied but not used: {', '.join(unused)}")
diff --git a/static/logo.png b/static/logo.png
new file mode 100644
index 0000000000..02cbc2b4ce
Binary files /dev/null and b/static/logo.png differ
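Note: the hunks above fold the per-class save/__str__/__init__ helpers into the shared AutoTrainParams base and move the params classes to the pydantic v2 API (model_dump, model_dump_json, model_fields). Below is a minimal sketch of how a subclass of that base could be exercised; ExampleParams and the /tmp/example path are illustrative only and are not part of this diff.

from pydantic import Field

from autotrain.trainers.common import AutoTrainParams


class ExampleParams(AutoTrainParams):
    # Illustrative subclass; the field pattern mirrors the real params classes above.
    model: str = Field("bert-base-uncased", title="Model name")
    lr: float = Field(5e-5, title="Learning rate")


params = ExampleParams(model="gpt2")    # logs a warning that "lr" was not supplied and stays at its default
params.save(output_dir="/tmp/example")  # writes /tmp/example/training_params.json via model_dump_json

with open("/tmp/example/training_params.json", encoding="utf-8") as f:
    restored = ExampleParams.model_validate_json(f.read())  # pydantic v2 replacement for parse_raw

Because save() now serializes with model_dump_json, the file it writes round-trips cleanly through model_validate_json, so no v1 parse_raw/parse_obj fallback is needed.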
diff --git a/templates/error.html b/templates/error.html
new file mode 100644
index 0000000000..07afb3ed67
--- /dev/null
+++ b/templates/error.html
@@ -0,0 +1,21 @@
[New 21-line HTML template. The markup did not survive extraction; the recoverable content is the AutoTrain logo header, an "Error" heading, and the message "Either HF_TOKEN or HF_USERNAME environment variable is not set." No newline at end of file.]
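Note: templates/error.html above and templates/index.html below are Jinja2 pages for the new FastAPI UI, and static/logo.png backs the logo shown on both. The following is a rough sketch of how such pages could be served; it is an assumption about the wiring, not the actual routes in src/autotrain/app.py.

import os

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates

app = FastAPI()
app.mount("/static", StaticFiles(directory="static"), name="static")  # serves static/logo.png
templates = Jinja2Templates(directory="templates")


@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    # Fall back to the error page when the required environment variables are missing.
    if os.environ.get("HF_TOKEN") is None or os.environ.get("HF_USERNAME") is None:
        return templates.TemplateResponse("error.html", {"request": request})
    return templates.TemplateResponse("index.html", {"request": request})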
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000000..be600a9078
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,499 @@
[New 499-line HTML template for the AutoTrain UI. The markup, form controls, and inline script did not survive extraction; the only recoverable content is the AutoTrain logo header. No newline at end of file.]
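Note: the clm/utils.py hunk earlier in this diff makes get_target_modules treat an all-whitespace target_modules string like None, falling back to the per-model defaults instead of splitting an empty string. A standalone sketch of that behavior follows; the local TARGET_MODULES entry and the SimpleNamespace config are stand-ins for illustration, the real table lives in autotrain.trainers.clm.utils.

from types import SimpleNamespace

# Illustrative stand-in for the per-model defaults table.
TARGET_MODULES = {"tiiuae/falcon-7b": "query_key_value"}


def get_target_modules(config):
    # Mirrors the patched helper: None or an all-whitespace string falls back to the model default.
    if config.target_modules is None:
        return TARGET_MODULES.get(config.model)
    elif config.target_modules.strip() == "":
        return TARGET_MODULES.get(config.model)
    return config.target_modules.split(",")


print(get_target_modules(SimpleNamespace(model="tiiuae/falcon-7b", target_modules="  ")))  # query_key_value
print(get_target_modules(SimpleNamespace(model="gpt2", target_modules="c_attn,c_proj")))   # ['c_attn', 'c_proj']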