fix char-based training preproc

segment-any-text · Jan 2, 2024 · 8462338 · 8462338
1 parent 78e2a43
commit 8462338
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py
@@ -412,7 +412,7 @@ def maybe_pad(text):
                 if not args.use_subwords:
                     lang_texts = [
                         maybe_pad(text)
-                        for text, lang in zip(examples[args.text_column], examples["lang"])
+                        for text, lang in zip(examples["input_ids"], examples["lang"])
                         if lang == current_lang
                     ]
                 else:
@@ -520,7 +520,7 @@ def maybe_pad(text):
         else:
             # this is no longer used and would cause an error otherwise
             with training_args.main_process_first():
-                dataset = dataset.remove_columns([args.text_column])
+                dataset = dataset.rename_column(args.text_column, "input_ids")
         logger.warning(f"Tokenized {split} dataset.")
 
         if split == "train":