Commit: fix tokenizer issues
guipenedo committed Sep 11, 2024
1 parent 25a5919 commit a147fd5
Showing 3 changed files with 80 additions and 7 deletions.
pyproject.toml (1 change: 1 addition & 0 deletions)

@@ -88,6 +88,7 @@ testing = [
"pytest-timeout",
"pytest-xdist",
"moto[s3,server]",
"spacy[ja]"
]
all = [
"datatrove[quality]",
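The new testing dependency is needed because spaCy ships its Japanese support behind an extra. A minimal sketch of what the extra enables, assuming spaCy v3 (the sudachipy/sudachidict_core packages are pulled in by "ja"; the sample sentence is illustrative):

import spacy

# Without spacy[ja] (which installs SudachiPy and its dictionary),
# constructing the blank Japanese pipeline fails with an import error.
nlp = spacy.blank("ja")
print([token.text for token in nlp("これはテストです。")])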
src/datatrove/utils/word_tokenizers.py (2 changes: 1 addition & 1 deletion)

@@ -227,7 +227,7 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]:
     Languages.korean: lambda: KiwiTokenizer(),
     Languages.german: lambda: SpaCyTokenizer("de"),
     Languages.french: lambda: SpaCyTokenizer("fr"),
-    Languages.czech: lambda: SpaCyTokenizer("cz"),
+    Languages.czech: lambda: SpaCyTokenizer("cs"),
     Languages.danish: lambda: SpaCyTokenizer("da"),
     Languages.dutch: lambda: SpaCyTokenizer("nl"),
     Languages.estonian: lambda: SpaCyTokenizer("et"),
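The one-character fix matters because spaCy resolves languages by ISO 639-1 code: Czech is "cs", while "cz" is a country code with no corresponding spacy.lang module. A hedged sketch of the failure mode (the ImportError type is an assumption based on spaCy v3 behavior):

import spacy

try:
    spacy.blank("cz")  # old code path: there is no spacy.lang.cz module
except ImportError as err:
    print(f"'cz' rejected: {err}")

nlp = spacy.blank("cs")  # fixed code path: loads the Czech language defaults
print([token.text for token in nlp("Ahoj světe!")])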
tests/pipeline/test_word_tokenizers.py (84 changes: 78 additions & 6 deletions)

@@ -1,12 +1,10 @@
 import unittest
 
-from nltk.tokenize import word_tokenize
-
 from datatrove.utils.word_tokenizers import WORD_TOKENIZER_FACTORY, load_word_tokenizer
 
 
 SAMPLE_TEXT = (
-    "I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
+    "'I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
     "see such times. But that is not for them to decide. All we have to decide is what to do with the time that is "
     "given us.' Hello world! \n\n ქართული \n\t Hello\nworld! "
 )
@@ -39,9 +37,83 @@ def test_span_tokenizers(self):
         assert all(spans_match_sents), f"'{language}' tokenizer spans don't match with sentences"
 
     def test_english_tokenizer(self):
-        nltk_words = word_tokenize(SAMPLE_TEXT, language="english")
-
         en_tokenizer = load_word_tokenizer("en")
         tokenizer_words = en_tokenizer.word_tokenize(SAMPLE_TEXT)
 
-        self.assertEqual(nltk_words, tokenizer_words, "NLTK tokenizer and multilingual tokenizer differ")
+        self.assertEqual(
+            [
+                "'",
+                "I",
+                "wish",
+                "it",
+                "need",
+                "not",
+                "have",
+                "happened",
+                "in",
+                "my",
+                "time",
+                ",",
+                "'",
+                "said",
+                "Frodo",
+                ".",
+                "'",
+                "So",
+                "do",
+                "I",
+                ",",
+                "'",
+                "said",
+                "Gandalf",
+                ",",
+                "'",
+                "and",
+                "so",
+                "do",
+                "all",
+                "who",
+                "live",
+                "to",
+                "see",
+                "such",
+                "times",
+                ".",
+                "But",
+                "that",
+                "is",
+                "not",
+                "for",
+                "them",
+                "to",
+                "decide",
+                ".",
+                "All",
+                "we",
+                "have",
+                "to",
+                "decide",
+                "is",
+                "what",
+                "to",
+                "do",
+                "with",
+                "the",
+                "time",
+                "that",
+                "is",
+                "given",
+                "us",
+                ".",
+                "'",
+                "Hello",
+                "world",
+                "!",
+                "ქართული",
+                "Hello",
+                "world",
+                "!",
+            ],
+            tokenizer_words,
+            "NLTK tokenizer and multilingual tokenizer differ",
+        )
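For context, a hedged usage sketch of the API the rewritten test pins down (identifiers taken from the diff; the exact token list depends on the installed datatrove and spaCy versions):

from datatrove.utils.word_tokenizers import load_word_tokenizer

tokenizer = load_word_tokenizer("en")  # same call the test uses
print(tokenizer.word_tokenize("All we have to decide is what to do."))
# e.g. ['All', 'we', 'have', 'to', 'decide', 'is', 'what', 'to', 'do', '.']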
