Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: 1.x - nltk upgrade, use nltk.download('punkt_tab') #8256

Merged
merged 15 commits into from
Aug 29, 2024
2 changes: 1 addition & 1 deletion haystack/nodes/preprocessor/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def __init__(
nltk.data.find("tokenizers/punkt")
except LookupError:
try:
nltk.download("punkt")
nltk.download("punkt_tab")
except FileExistsError as error:
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
pass
Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ dependencies = [
[project.optional-dependencies]
inference = [
"transformers[torch,sentencepiece]==4.39.3",
"sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"huggingface-hub>=0.5.0",
]
elasticsearch = [
Expand Down Expand Up @@ -147,13 +147,13 @@ crawler = [
"selenium>=4.11.0"
]
preprocessing = [
"nltk",
"nltk>=3.9",
vblagoje marked this conversation as resolved.
Show resolved Hide resolved
"langdetect", # for language classification
]
file-conversion = [
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table extractor)
"python-docx",
"python-pptx",
"python-pptx<=1.0",
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"tika", # Apache Tika (text & metadata extractor)
"beautifulsoup4",
"markdown",
Expand Down Expand Up @@ -191,7 +191,7 @@ colab = [
dev = [
"pre-commit",
# Type check
"mypy",
"mypy==1.10.0",
# Test
"pytest",
"pytest-cov",
Expand Down
2 changes: 2 additions & 0 deletions releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fixes:
- Upgrades nltk to 3.9.1 as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Due to these security vulnerabilities, it is not possible to use custom NLTK tokenizer models with the new version (for example in PreProcessor).
1 change: 1 addition & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_leng

@pytest.mark.unit
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9, can't load this model pt anymore")
def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path):
split_length, expected_documents_count = split_length_and_results

Expand Down
Loading