From a147fd50c32452c871b2b137d6e0e5f1381f4506 Mon Sep 17 00:00:00 2001
From: guipenedo
Date: Wed, 11 Sep 2024 12:05:49 +0200
Subject: [PATCH] fix tokenizer issues

---
 pyproject.toml                         |  1 +
 src/datatrove/utils/word_tokenizers.py |  2 +-
 tests/pipeline/test_word_tokenizers.py | 84 ++++++++++++++++++++++++--
 3 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cf226903..2a4ae3ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,6 +88,7 @@ testing = [
     "pytest-timeout",
     "pytest-xdist",
     "moto[s3,server]",
+    "spacy[ja]"
 ]
 all = [
     "datatrove[quality]",
diff --git a/src/datatrove/utils/word_tokenizers.py b/src/datatrove/utils/word_tokenizers.py
index f43586dc..6c4f7adb 100644
--- a/src/datatrove/utils/word_tokenizers.py
+++ b/src/datatrove/utils/word_tokenizers.py
@@ -227,7 +227,7 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]:
     Languages.korean: lambda: KiwiTokenizer(),
     Languages.german: lambda: SpaCyTokenizer("de"),
     Languages.french: lambda: SpaCyTokenizer("fr"),
-    Languages.czech: lambda: SpaCyTokenizer("cz"),
+    Languages.czech: lambda: SpaCyTokenizer("cs"),
     Languages.danish: lambda: SpaCyTokenizer("da"),
     Languages.dutch: lambda: SpaCyTokenizer("nl"),
     Languages.estonian: lambda: SpaCyTokenizer("et"),
diff --git a/tests/pipeline/test_word_tokenizers.py b/tests/pipeline/test_word_tokenizers.py
index 9f67a44d..b8d4698a 100644
--- a/tests/pipeline/test_word_tokenizers.py
+++ b/tests/pipeline/test_word_tokenizers.py
@@ -1,12 +1,10 @@
 import unittest
 
-from nltk.tokenize import word_tokenize
-
 from datatrove.utils.word_tokenizers import WORD_TOKENIZER_FACTORY, load_word_tokenizer
 
 
 SAMPLE_TEXT = (
-    "I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
+    "'I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to "
     "see such times. But that is not for them to decide. All we have to decide is what to do with the time that is "
     "given us.' Hello world! \n\n ქართული \n\t Hello\nworld! "
 )
@@ -39,9 +37,83 @@ def test_span_tokenizers(self):
         assert all(spans_match_sents), f"'{language}' tokenizer spans don't match with sentences"
 
     def test_english_tokenizer(self):
-        nltk_words = word_tokenize(SAMPLE_TEXT, language="english")
-
         en_tokenizer = load_word_tokenizer("en")
         tokenizer_words = en_tokenizer.word_tokenize(SAMPLE_TEXT)
-        self.assertEqual(nltk_words, tokenizer_words, "NLTK tokenizer and multilingual tokenizer differ")
+        self.assertEqual(
+            [
+                "'",
+                "I",
+                "wish",
+                "it",
+                "need",
+                "not",
+                "have",
+                "happened",
+                "in",
+                "my",
+                "time",
+                ",",
+                "'",
+                "said",
+                "Frodo",
+                ".",
+                "'",
+                "So",
+                "do",
+                "I",
+                ",",
+                "'",
+                "said",
+                "Gandalf",
+                ",",
+                "'",
+                "and",
+                "so",
+                "do",
+                "all",
+                "who",
+                "live",
+                "to",
+                "see",
+                "such",
+                "times",
+                ".",
+                "But",
+                "that",
+                "is",
+                "not",
+                "for",
+                "them",
+                "to",
+                "decide",
+                ".",
+                "All",
+                "we",
+                "have",
+                "to",
+                "decide",
+                "is",
+                "what",
+                "to",
+                "do",
+                "with",
+                "the",
+                "time",
+                "that",
+                "is",
+                "given",
+                "us",
+                ".",
+                "'",
+                "Hello",
+                "world",
+                "!",
+                "ქართული",
+                "Hello",
+                "world",
+                "!",
+            ],
+            tokenizer_words,
+            "NLTK tokenizer and multilingual tokenizer differ",
+        )