Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: 1.x - nltk upgrade, use nltk.download('punkt_tab') #8256

Merged
merged 15 commits into from
Aug 29, 2024
2 changes: 1 addition & 1 deletion haystack/nodes/preprocessor/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def __init__(
nltk.data.find("tokenizers/punkt")
except LookupError:
try:
nltk.download("punkt")
nltk.download("punkt_tab")
except FileExistsError as error:
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
pass
Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ dependencies = [
[project.optional-dependencies]
inference = [
"transformers[torch,sentencepiece]==4.39.3",
"sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"huggingface-hub>=0.5.0",
]
elasticsearch = [
Expand Down Expand Up @@ -147,13 +147,13 @@ crawler = [
"selenium>=4.11.0"
]
preprocessing = [
"nltk",
"nltk>=3.9",
vblagoje marked this conversation as resolved.
Show resolved Hide resolved
"langdetect", # for language classification
]
file-conversion = [
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table extractor)
"python-docx",
"python-pptx",
"python-pptx<=1.0",
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"tika", # Apache Tika (text & metadata extractor)
"beautifulsoup4",
"markdown",
Expand Down Expand Up @@ -191,7 +191,7 @@ colab = [
dev = [
"pre-commit",
# Type check
"mypy",
"mypy==1.10.0",
# Test
"pytest",
"pytest-cov",
Expand Down
2 changes: 2 additions & 0 deletions releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fixes:
- Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Due to these security vulnerabilities, it is not possible to use custom NLTK tokenizer models with the new version (for example in PreProcessor).
1 change: 1 addition & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_leng

@pytest.mark.unit
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9, can't load this model pt anymore")
def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path):
split_length, expected_documents_count = split_length_and_results

Expand Down
Loading