From 06399e8c68a1021540938e2558ffd6a46c33e1ff Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 29 Aug 2024 15:05:56 +0200
Subject: [PATCH] Use PunktTokenizer instead of nltk.data.load

---
 haystack/nodes/preprocessor/preprocessor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index 13c0b70393..9d76909bda 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -28,6 +28,7 @@
 
 with LazyImport("Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk'") as nltk_import:
     import nltk
+    from nltk.tokenize.punkt import PunktTokenizer
 
 iso639_to_nltk = {
     "ru": "russian",
@@ -929,14 +930,14 @@ def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokeni
 
         # Use a default NLTK model
         elif language_name is not None:
-            sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
+            sentence_tokenizer = PunktTokenizer(language_name)
         else:
             logger.error(
                 "PreProcessor couldn't find the default sentence tokenizer model for %s. "
                 " Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
                 self.language,
             )
-            sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
+            sentence_tokenizer = PunktTokenizer()  # default english model
 
         return sentence_tokenizer