Skip to content

Commit

Permalink
fix char-based training preproc
Browse files Browse the repository at this point in the history
  • Loading branch information
markus583 committed Jan 2, 2024
1 parent 78e2a43 commit 8462338
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions wtpsplit/train/train.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -412,7 +412,7 @@ def maybe_pad(text):
if not args.use_subwords:
lang_texts = [
maybe_pad(text)
- for text, lang in zip(examples[args.text_column], examples["lang"])
+ for text, lang in zip(examples["input_ids"], examples["lang"])
if lang == current_lang
]
else:
Expand Down Expand Up @@ -520,7 +520,7 @@ def maybe_pad(text):
else:
# this is no longer used and would cause an error otherwise
with training_args.main_process_first():
- dataset = dataset.remove_columns([args.text_column])
+ dataset = dataset.rename_column(args.text_column, "input_ids")
logger.warning(f"Tokenized {split} dataset.")

if split == "train":
Expand Down

0 comments on commit 8462338

Please sign in to comment.