Commit: fix tokenizer issues
guipenedo committed Sep 11, 2024
1 parent 25a5919 commit a147fd5
Showing 3 changed files with 80 additions and 7 deletions.
pyproject.toml (1 change: 1 addition & 0 deletions)

@@ -88,6 +88,7 @@ testing = [
"pytest-timeout",
"pytest-xdist",
"moto[s3,server]",
"spacy[ja]"
]
all = [
"datatrove[quality]",
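The new testing dependency is needed because spaCy ships its Japanese support behind an extra. A minimal sketch of what the extra enables, assuming spaCy v3 (the sudachipy/sudachidict_core packages are pulled in by "ja"; the sample sentence is illustrative):

import spacy

# Without spacy[ja] (which installs SudachiPy and its dictionary),
# constructing the blank Japanese pipeline fails with an import error.
nlp = spacy.blank("ja")
print([token.text for token in nlp("これはテストです。")])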
src/datatrove/utils/word_tokenizers.py (2 changes: 1 addition & 1 deletion)

@@ -227,7 +227,7 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]:
     Languages.korean: lambda: KiwiTokenizer(),
     Languages.german: lambda: SpaCyTokenizer("de"),
     Languages.french: lambda: SpaCyTokenizer("fr"),
-    Languages.czech: lambda: SpaCyTokenizer("cz"),
+    Languages.czech: lambda: SpaCyTokenizer("cs"),
     Languages.danish: lambda: SpaCyTokenizer("da"),
     Languages.dutch: lambda: SpaCyTokenizer("nl"),
     Languages.estonian: lambda: SpaCyTokenizer("et"),
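The one-character fix matters because spaCy resolves languages by ISO 639-1 code: Czech is "cs", while "cz" is a country code with no corresponding spacy.lang module. A hedged sketch of the failure mode (the ImportError type is an assumption based on spaCy v3 behavior):

import spacy

try:
    spacy.blank("cz")  # old code path: there is no spacy.lang.cz module
except ImportError as err:
    print(f"'cz' rejected: {err}")

nlp = spacy.blank("cs")  # fixed code path: loads the Czech language defaults
print([token.text for token in nlp("Ahoj světe!")])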
tests/pipeline/test_word_tokenizers.py (84 changes: 78 additions & 6 deletions)

@@ -1,12 +1,10 @@
 import unittest
 
-from nltk.tokenize import word_tokenize
-
 from datatrove.utils.word_tokenizers import WORD_TOKENIZER_FACTORY, load_word_tokenizer
 
 
 SAMPLE_TEXT = (
-    "I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
+    "'I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
     "see such times. But that is not for them to decide. All we have to decide is what to do with the time that is "
     "given us.' Hello world! \n\n ქართული \n\t Hello\nworld! "
 )
@@ -39,9 +37,83 @@ def test_span_tokenizers(self):
         assert all(spans_match_sents), f"'{language}' tokenizer spans don't match with sentences"
 
     def test_english_tokenizer(self):
-        nltk_words = word_tokenize(SAMPLE_TEXT, language="english")
-
         en_tokenizer = load_word_tokenizer("en")
         tokenizer_words = en_tokenizer.word_tokenize(SAMPLE_TEXT)
 
-        self.assertEqual(nltk_words, tokenizer_words, "NLTK tokenizer and multilingual tokenizer differ")
+        self.assertEqual(
+            [
+                "'",
+                "I",
+                "wish",
+                "it",
+                "need",
+                "not",
+                "have",
+                "happened",
+                "in",
+                "my",
+                "time",
+                ",",
+                "'",
+                "said",
+                "Frodo",
+                ".",
+                "'",
+                "So",
+                "do",
+                "I",
+                ",",
+                "'",
+                "said",
+                "Gandalf",
+                ",",
+                "'",
+                "and",
+                "so",
+                "do",
+                "all",
+                "who",
+                "live",
+                "to",
+                "see",
+                "such",
+                "times",
+                ".",
+                "But",
+                "that",
+                "is",
+                "not",
+                "for",
+                "them",
+                "to",
+                "decide",
+                ".",
+                "All",
+                "we",
+                "have",
+                "to",
+                "decide",
+                "is",
+                "what",
+                "to",
+                "do",
+                "with",
+                "the",
+                "time",
+                "that",
+                "is",
+                "given",
+                "us",
+                ".",
+                "'",
+                "Hello",
+                "world",
+                "!",
+                "ქართული",
+                "Hello",
+                "world",
+                "!",
+            ],
+            tokenizer_words,
+            "NLTK tokenizer and multilingual tokenizer differ",
+        )
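For context, a hedged usage sketch of the API the rewritten test pins down (identifiers taken from the diff; the exact token list depends on the installed datatrove and spaCy versions):

from datatrove.utils.word_tokenizers import load_word_tokenizer

tokenizer = load_word_tokenizer("en")  # same call the test uses
print(tokenizer.word_tokenize("All we have to decide is what to do."))
# e.g. ['All', 'we', 'have', 'to', 'decide', 'is', 'what', 'to', 'do', '.']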
