diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 9dd21978ce..9d76909bda 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -28,6 +28,7 @@ with LazyImport("Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk'") as nltk_import: import nltk + from nltk.tokenize.punkt import PunktTokenizer iso639_to_nltk = { "ru": "russian", @@ -120,10 +121,18 @@ def __init__( nltk.data.find("tokenizers/punkt") except LookupError: try: - nltk.download("punkt") + nltk.download("punkt_tab") except FileExistsError as error: logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error) pass + + if tokenizer_model_folder is not None: + warnings.warn( + "Custom NLTK tokenizers are no longer allowed. " + "The 'tokenizer_model_folder' parameter will be ignored. " + "Please use the built-in nltk tokenizers instead by specifying the `language` parameter." + ) + self.tokenizer_model_folder = None self.clean_whitespace = clean_whitespace self.clean_header_footer = clean_header_footer self.clean_empty_lines = clean_empty_lines @@ -134,7 +143,6 @@ def __init__( self.split_respect_sentence_boundary = split_respect_sentence_boundary self.tokenizer = tokenizer self.language = language - self.tokenizer_model_folder = tokenizer_model_folder self.print_log: Set[str] = set() self.id_hash_keys = id_hash_keys self.progress_bar = progress_bar @@ -922,14 +930,14 @@ def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokeni # Use a default NLTK model elif language_name is not None: - sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle") + sentence_tokenizer = PunktTokenizer(language_name) else: logger.error( "PreProcessor couldn't find the default sentence tokenizer model for %s. " " Using English instead. 
You may train your own model and use the 'tokenizer_model_folder' parameter.", self.language, ) - sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle") + sentence_tokenizer = PunktTokenizer() # default english model return sentence_tokenizer diff --git a/pyproject.toml b/pyproject.toml index 42b8acda5a..5d99d923b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,7 @@ dependencies = [ [project.optional-dependencies] inference = [ "transformers[torch,sentencepiece]==4.39.3", - "sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder + "sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder "huggingface-hub>=0.5.0", ] elasticsearch = [ @@ -147,13 +147,13 @@ crawler = [ "selenium>=4.11.0" ] preprocessing = [ - "nltk", + "nltk>=3.9.1", "langdetect", # for language classification ] file-conversion = [ "azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table exctrator) "python-docx", - "python-pptx", + "python-pptx<=1.0", "tika", # Apache Tika (text & metadata extractor) "beautifulsoup4", "markdown", @@ -191,7 +191,7 @@ colab = [ dev = [ "pre-commit", # Type check - "mypy", + "mypy==1.10.0", # Test "pytest", "pytest-cov", diff --git a/releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml b/releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml new file mode 100644 index 0000000000..7aa7817cef --- /dev/null +++ b/releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml @@ -0,0 +1,9 @@ +fixes: + - | + Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. +upgrade: + - | + Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Due to these security vulnerabilities, it is not possible to use custom NLTK tokenizer models with the new version (for example in PreProcessor). 
Users can still use built-in nltk tokenizers by specifying the language parameter in the PreProcessor. See PreProcessor documentation for more details. +enhancements: + - | + Pins sentence-transformers<=3.0.0,>=2.3.1 and python-pptx<=1.0 to avoid some minor typing incompatibilities with the newer versions of the respective libraries. diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index b719fe4a74..551e63f4e2 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_leng @pytest.mark.unit @pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)]) +@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9, can't load this model pt anymore") def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path): split_length, expected_documents_count = split_length_and_results