
Commit e13b01c: fixes

abhishekkrthakur committed Nov 27, 2024 · 1 parent e10241b

Showing 16 changed files with 67 additions and 13 deletions.
6 changes: 3 additions & 3 deletions configs/token_classification/local_dataset.yml
@@ -7,10 +7,10 @@ backend: local
data:
path: data/ # this must be the path to the directory containing the train and valid files
train_split: train # this must be either train.json
-valid_split: valid # this must be either valid.json, can also be set to null
+valid_split: test # this must be either valid.json, can also be set to null
column_mapping:
-text_column: text # this must be the name of the column containing the text
-target_column: label # this must be the name of the column containing the target
+tokens_column: tokens # this must be the name of the column containing the tokens
+tags_column: tags # this must be the name of the column containing the tags

params:
max_seq_length: 512
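For reference, a minimal sketch of creating the local files this config points at, assuming `train_split: train` and `valid_split: test` resolve to `data/train.json` and `data/test.json` in JSONL form (the file naming is an assumption, not stated in the diff):

```python
import json
import os

# Two toy records in the tokens/tags schema used throughout this commit.
rows = [
    {"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]},
    {"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]},
]

os.makedirs("data", exist_ok=True)
for split in ("train", "test"):  # mirrors train_split / valid_split above
    with open(os.path.join("data", f"{split}.json"), "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")  # one JSON object per line (JSONL)
```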
12 changes: 8 additions & 4 deletions docs/source/tasks/token_classification.mdx
@@ -11,8 +11,8 @@ The data should be in the following CSV format:

```csv
tokens,tags
-"['I', 'love', 'Paris']", "['O', 'O', 'B-LOC']"
-"['I', 'live', 'in', 'New', 'York']", "['O', 'O', 'O', 'B-LOC', 'I-LOC']"
+"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']"
+"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']"
.
.
.
@@ -21,8 +21,8 @@ tokens,tags
or you can also use JSONL format:

```json
-{"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]}
-{"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
+{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]}
+{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
.
.
.
@@ -51,6 +51,10 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size):
i += 1
```


+Sample dataset from HuggingFace Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003)


## Columns

Your CSV/JSONL dataset must have two columns: `tokens` and `tags`.
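One detail the docs do not spell out: in the CSV layout above, each cell holds a Python-style list serialized as a string, so it must be parsed back into a real list before training or inspection. A minimal sketch using `ast.literal_eval` (the `example.csv` name is borrowed from the chunking snippet above):

```python
import ast

import pandas as pd

df = pd.read_csv("example.csv")  # tokens,tags columns with stringified lists
df["tokens"] = df["tokens"].apply(ast.literal_eval)
df["tags"] = df["tags"].apply(ast.literal_eval)

# Each row should now hold aligned lists: one tag per token.
assert len(df.loc[0, "tokens"]) == len(df.loc[0, "tags"])
```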
4 changes: 2 additions & 2 deletions src/autotrain/app/params.py
@@ -397,8 +397,8 @@ def _munge_params_token_clf(self):
_params["tags_column"] = "autotrain_label"
_params["valid_split"] = "validation"
else:
-_params["tokens_column"] = self.column_mapping.get("text" if not self.api else "tokens_column", "text")
-_params["tags_column"] = self.column_mapping.get("label" if not self.api else "tags_column", "label")
+_params["tokens_column"] = self.column_mapping.get("tokens" if not self.api else "tokens_column", "tokens")
+_params["tags_column"] = self.column_mapping.get("tags" if not self.api else "tags_column", "tags")
_params["train_split"] = self.train_split
_params["valid_split"] = self.valid_split

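To see what this hunk fixes, here is the lookup in isolation (keys and defaults taken from the diff; the example mapping is hypothetical). In the non-API path the UI sends a mapping keyed by `tokens`/`tags`, but the old code looked up `text`/`label` and therefore always fell back to the defaults:

```python
# Hypothetical payload from the UI: dataset uses custom column names.
column_mapping = {"tokens": "my_tokens", "tags": "my_tags"}
api = False

# Old lookup: wrong key, silently returns the default "text".
old_tokens = column_mapping.get("text" if not api else "tokens_column", "text")

# New lookup: correct key, picks up the user's column name.
new_tokens = column_mapping.get("tokens" if not api else "tokens_column", "tokens")

print(old_tokens)  # -> text (user mapping ignored)
print(new_tokens)  # -> my_tokens
```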
2 changes: 1 addition & 1 deletion src/autotrain/app/templates/index.html
@@ -73,7 +73,7 @@
fieldNames = ['text', 'target'];
break;
case 'token-classification':
-fields = ['text', 'label'];
+fields = ['tokens', 'tags'];
fieldNames = ['tokens', 'tags'];
break;
case 'dreambooth':
4 changes: 2 additions & 2 deletions src/autotrain/project.py
@@ -262,8 +262,8 @@ def token_clf_munge_data(params, local):
)
params.data_path = dset.prepare()
params.valid_split = "validation"
-params.text_column = "autotrain_text"
-params.target_column = "autotrain_label"
+params.tokens_column = "autotrain_text"
+params.tags_column = "autotrain_label"
return params


4 changes: 4 additions & 0 deletions src/autotrain/trainers/clm/utils.py
@@ -494,12 +494,14 @@ def process_input_data(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
# rename columns for reward trainer
if config.trainer in ("dpo", "reward", "orpo"):
@@ -522,12 +524,14 @@ def process_input_data(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.trainer in ("dpo", "reward", "orpo"):
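The rest of the commit repeats a single pattern: each `load_dataset` call gains `trust_remote_code=ALLOW_REMOTE_CODE`, the flag imported from `autotrain.trainers.common` (visible in the tabular diff below). A minimal sketch of the pattern on its own, with an illustrative dataset name:

```python
from datasets import load_dataset

from autotrain.trainers.common import ALLOW_REMOTE_CODE

# Recent `datasets` releases require an explicit opt-in before executing a
# dataset's loading script; threading one flag through keeps that decision
# in a single place.
train_data = load_dataset(
    "eriktks/conll2003",  # illustrative dataset path
    split="train",
    token=None,  # or an HF access token for gated/private datasets
    trust_remote_code=ALLOW_REMOTE_CODE,
)
```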
@@ -59,12 +59,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -79,12 +81,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

logger.info(train_data)
4 changes: 4 additions & 0 deletions src/autotrain/trainers/image_classification/__main__.py
@@ -52,12 +52,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

logger.info(f"Train data: {train_data}")
4 changes: 4 additions & 0 deletions src/autotrain/trainers/image_regression/__main__.py
@@ -52,12 +52,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

logger.info(f"Train data: {train_data}")
4 changes: 4 additions & 0 deletions src/autotrain/trainers/object_detection/__main__.py
@@ -53,12 +53,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -72,12 +74,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

logger.info(f"Train data: {train_data}")
4 changes: 4 additions & 0 deletions src/autotrain/trainers/sent_transformers/__main__.py
@@ -54,12 +54,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -74,12 +76,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

num_classes = None
4 changes: 4 additions & 0 deletions src/autotrain/trainers/seq2seq/__main__.py
@@ -62,12 +62,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -82,12 +84,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE)
12 changes: 11 additions & 1 deletion src/autotrain/trainers/tabular/__main__.py
@@ -13,7 +13,13 @@
from sklearn.compose import ColumnTransformer

from autotrain import logger
-from autotrain.trainers.common import monitor, pause_space, remove_autotrain_data, save_training_params
+from autotrain.trainers.common import (
+    ALLOW_REMOTE_CODE,
+    monitor,
+    pause_space,
+    remove_autotrain_data,
+    save_training_params,
+)
from autotrain.trainers.tabular import utils
from autotrain.trainers.tabular.params import TabularParams

@@ -187,12 +193,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
train_data = train_data.to_pandas()

@@ -208,12 +216,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
valid_data = valid_data.to_pandas()

4 changes: 4 additions & 0 deletions src/autotrain/trainers/text_classification/__main__.py
@@ -57,12 +57,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -77,12 +79,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

classes = train_data.features[config.target_column].names
4 changes: 4 additions & 0 deletions src/autotrain/trainers/text_regression/__main__.py
@@ -57,12 +57,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

if config.valid_split is not None:
@@ -77,12 +79,14 @@ def train(config):
name=dataset_config_name,
split=split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
+trust_remote_code=ALLOW_REMOTE_CODE,
)

model_config = AutoConfig.from_pretrained(
