From e13b01cae8d6268ef54c83087ae7780327c0bf4a Mon Sep 17 00:00:00 2001 From: abhishekkrthakur Date: Wed, 27 Nov 2024 10:48:13 +0100 Subject: [PATCH] fixes --- configs/token_classification/local_dataset.yml | 6 +++--- docs/source/tasks/token_classification.mdx | 12 ++++++++---- src/autotrain/app/params.py | 4 ++-- src/autotrain/app/templates/index.html | 2 +- src/autotrain/project.py | 4 ++-- src/autotrain/trainers/clm/utils.py | 4 ++++ .../extractive_question_answering/__main__.py | 4 ++++ .../trainers/image_classification/__main__.py | 4 ++++ src/autotrain/trainers/image_regression/__main__.py | 4 ++++ src/autotrain/trainers/object_detection/__main__.py | 4 ++++ src/autotrain/trainers/sent_transformers/__main__.py | 4 ++++ src/autotrain/trainers/seq2seq/__main__.py | 4 ++++ src/autotrain/trainers/tabular/__main__.py | 12 +++++++++++- .../trainers/text_classification/__main__.py | 4 ++++ src/autotrain/trainers/text_regression/__main__.py | 4 ++++ .../trainers/token_classification/__main__.py | 4 ++++ 16 files changed, 67 insertions(+), 13 deletions(-) diff --git a/configs/token_classification/local_dataset.yml b/configs/token_classification/local_dataset.yml index b03717298b..97ae844057 100644 --- a/configs/token_classification/local_dataset.yml +++ b/configs/token_classification/local_dataset.yml @@ -7,10 +7,10 @@ backend: local data: path: data/ # this must be the path to the directory containing the train and valid files train_split: train # this must be either train.json - valid_split: valid # this must be either valid.json, can also be set to null + valid_split: test # this must be either valid.json, can also be set to null column_mapping: - text_column: text # this must be the name of the column containing the text - target_column: label # this must be the name of the column containing the target + tokens_column: tokens # this must be the name of the column containing the text + tags_column: tags # this must be the name of the column containing the target params: max_seq_length: 512 diff --git a/docs/source/tasks/token_classification.mdx b/docs/source/tasks/token_classification.mdx index d4a2b46db3..a4b064e16b 100644 --- a/docs/source/tasks/token_classification.mdx +++ b/docs/source/tasks/token_classification.mdx @@ -11,8 +11,8 @@ The data should be in the following CSV format: ```csv tokens,tags -"['I', 'love', 'Paris']", "['O', 'O', 'B-LOC']" -"['I', 'live', 'in', 'New', 'York']", "['O', 'O', 'O', 'B-LOC', 'I-LOC']" +"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']" +"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']" . . . @@ -21,8 +21,8 @@ tokens,tags or you can also use JSONL format: ```json -{"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]} -{"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]} +{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]} +{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]} . . . @@ -51,6 +51,10 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size): i += 1 ``` + +Sample dataset from HuggingFace Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003) + + ## Columns Your CSV/JSONL dataset must have two columns: `tokens` and `tags`. diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py index 7de8a5e11b..41b569b39a 100644 --- a/src/autotrain/app/params.py +++ b/src/autotrain/app/params.py @@ -397,8 +397,8 @@ def _munge_params_token_clf(self): _params["tags_column"] = "autotrain_label" _params["valid_split"] = "validation" else: - _params["tokens_column"] = self.column_mapping.get("text" if not self.api else "tokens_column", "text") - _params["tags_column"] = self.column_mapping.get("label" if not self.api else "tags_column", "label") + _params["tokens_column"] = self.column_mapping.get("tokens" if not self.api else "tokens_column", "tokens") + _params["tags_column"] = self.column_mapping.get("tags" if not self.api else "tags_column", "tags") _params["train_split"] = self.train_split _params["valid_split"] = self.valid_split diff --git a/src/autotrain/app/templates/index.html b/src/autotrain/app/templates/index.html index 00501202cf..437076e1c7 100644 --- a/src/autotrain/app/templates/index.html +++ b/src/autotrain/app/templates/index.html @@ -73,7 +73,7 @@ fieldNames = ['text', 'target']; break; case 'token-classification': - fields = ['text', 'label']; + fields = ['tokens', 'tags']; fieldNames = ['tokens', 'tags']; break; case 'dreambooth': diff --git a/src/autotrain/project.py b/src/autotrain/project.py index 121c8fd1e2..528a4aa7cd 100644 --- a/src/autotrain/project.py +++ b/src/autotrain/project.py @@ -262,8 +262,8 @@ def token_clf_munge_data(params, local): ) params.data_path = dset.prepare() params.valid_split = "validation" - params.text_column = "autotrain_text" - params.target_column = "autotrain_label" + params.tokens_column = "autotrain_text" + params.tags_column = "autotrain_label" return params diff --git a/src/autotrain/trainers/clm/utils.py b/src/autotrain/trainers/clm/utils.py index 78ef1df021..5d000889a8 100644 --- a/src/autotrain/trainers/clm/utils.py +++ b/src/autotrain/trainers/clm/utils.py @@ -494,12 +494,14 @@ def process_input_data(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) # rename columns for reward trainer if config.trainer in ("dpo", "reward", "orpo"): @@ -522,12 +524,14 @@ def process_input_data(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.trainer in ("dpo", "reward", "orpo"): diff --git a/src/autotrain/trainers/extractive_question_answering/__main__.py b/src/autotrain/trainers/extractive_question_answering/__main__.py index 46002769b2..42c3646949 100644 --- a/src/autotrain/trainers/extractive_question_answering/__main__.py +++ b/src/autotrain/trainers/extractive_question_answering/__main__.py @@ -59,12 +59,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -79,12 +81,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) logger.info(train_data) diff --git a/src/autotrain/trainers/image_classification/__main__.py b/src/autotrain/trainers/image_classification/__main__.py index 3b399900a9..8d29c1c74a 100644 --- a/src/autotrain/trainers/image_classification/__main__.py +++ b/src/autotrain/trainers/image_classification/__main__.py @@ -52,12 +52,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -71,12 +73,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) logger.info(f"Train data: {train_data}") diff --git a/src/autotrain/trainers/image_regression/__main__.py b/src/autotrain/trainers/image_regression/__main__.py index 388eee5362..a4db2e06de 100644 --- a/src/autotrain/trainers/image_regression/__main__.py +++ b/src/autotrain/trainers/image_regression/__main__.py @@ -52,12 +52,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -71,12 +73,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) logger.info(f"Train data: {train_data}") diff --git a/src/autotrain/trainers/object_detection/__main__.py b/src/autotrain/trainers/object_detection/__main__.py index faadf462e7..bd6a7b4605 100644 --- a/src/autotrain/trainers/object_detection/__main__.py +++ b/src/autotrain/trainers/object_detection/__main__.py @@ -53,12 +53,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -72,12 +74,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) logger.info(f"Train data: {train_data}") diff --git a/src/autotrain/trainers/sent_transformers/__main__.py b/src/autotrain/trainers/sent_transformers/__main__.py index 76bf5f7ea0..2b3832a540 100644 --- a/src/autotrain/trainers/sent_transformers/__main__.py +++ b/src/autotrain/trainers/sent_transformers/__main__.py @@ -54,12 +54,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -74,12 +76,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) num_classes = None diff --git a/src/autotrain/trainers/seq2seq/__main__.py b/src/autotrain/trainers/seq2seq/__main__.py index f7c7f94a9b..390e70927f 100644 --- a/src/autotrain/trainers/seq2seq/__main__.py +++ b/src/autotrain/trainers/seq2seq/__main__.py @@ -62,12 +62,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -82,12 +84,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE) diff --git a/src/autotrain/trainers/tabular/__main__.py b/src/autotrain/trainers/tabular/__main__.py index 862eb222dd..4474f8780e 100644 --- a/src/autotrain/trainers/tabular/__main__.py +++ b/src/autotrain/trainers/tabular/__main__.py @@ -13,7 +13,13 @@ from sklearn.compose import ColumnTransformer from autotrain import logger -from autotrain.trainers.common import monitor, pause_space, remove_autotrain_data, save_training_params +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) from autotrain.trainers.tabular import utils from autotrain.trainers.tabular.params import TabularParams @@ -187,12 +193,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) train_data = train_data.to_pandas() @@ -208,12 +216,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) valid_data = valid_data.to_pandas() diff --git a/src/autotrain/trainers/text_classification/__main__.py b/src/autotrain/trainers/text_classification/__main__.py index 5ce3918e87..5b2cf67af5 100644 --- a/src/autotrain/trainers/text_classification/__main__.py +++ b/src/autotrain/trainers/text_classification/__main__.py @@ -57,12 +57,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -77,12 +79,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) classes = train_data.features[config.target_column].names diff --git a/src/autotrain/trainers/text_regression/__main__.py b/src/autotrain/trainers/text_regression/__main__.py index 4d108bf7f4..3425d3e56a 100644 --- a/src/autotrain/trainers/text_regression/__main__.py +++ b/src/autotrain/trainers/text_regression/__main__.py @@ -57,12 +57,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -77,12 +79,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) model_config = AutoConfig.from_pretrained( diff --git a/src/autotrain/trainers/token_classification/__main__.py b/src/autotrain/trainers/token_classification/__main__.py index 1a9ed714b1..ad5b461c30 100644 --- a/src/autotrain/trainers/token_classification/__main__.py +++ b/src/autotrain/trainers/token_classification/__main__.py @@ -58,12 +58,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: train_data = load_dataset( config.data_path, split=config.train_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) if config.valid_split is not None: @@ -78,12 +80,14 @@ def train(config): name=dataset_config_name, split=split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) else: valid_data = load_dataset( config.data_path, split=config.valid_split, token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, ) label_list = train_data.features[config.tags_column].feature.names