
fix: 1.x - nltk upgrade, use nltk.download('punkt_tab') #8256

Merged
15 commits, merged Aug 29, 2024
11 changes: 9 additions & 2 deletions haystack/nodes/preprocessor/preprocessor.py
@@ -120,10 +120,18 @@ def __init__(
nltk.data.find("tokenizers/punkt")
except LookupError:
try:
nltk.download("punkt")
nltk.download("punkt_tab")
except FileExistsError as error:
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
pass

if tokenizer_model_folder is not None:
warnings.warn(
"Custom NLTK tokenizers are no longer allowed. "
"The 'tokenizer_model_folder' parameter will be ignored. "
"Please use the built-in nltk tokenizers instead by specifying the `language` parameter."
)
self.tokenizer_model_folder = None
self.clean_whitespace = clean_whitespace
self.clean_header_footer = clean_header_footer
self.clean_empty_lines = clean_empty_lines
@@ -134,7 +142,6 @@ def __init__(
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.tokenizer = tokenizer
self.language = language
self.tokenizer_model_folder = tokenizer_model_folder
self.print_log: Set[str] = set()
self.id_hash_keys = id_hash_keys
self.progress_bar = progress_bar
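For context, a minimal sketch (not part of this diff) of how the new resource is fetched and used: punkt_tab replaces the pickle-based punkt resource in NLTK >= 3.9, and nltk.data.find, nltk.download, and sent_tokenize are standard NLTK APIs.

    import nltk
    from nltk.tokenize import sent_tokenize

    # Look up the new rule-table resource; download it on first use.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab")

    # sent_tokenize resolves punkt_tab for the requested language in NLTK >= 3.9.
    print(sent_tokenize("First sentence. Second one.", language="english"))
    # ['First sentence.', 'Second one.']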
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -87,7 +87,7 @@ dependencies = [
[project.optional-dependencies]
inference = [
"transformers[torch,sentencepiece]==4.39.3",
"sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"huggingface-hub>=0.5.0",
]
elasticsearch = [
@@ -147,13 +147,13 @@ crawler = [
"selenium>=4.11.0"
]
preprocessing = [
"nltk",
"nltk>=3.9",
"langdetect", # for language classification
]
file-conversion = [
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table exctrator)
"python-docx",
"python-pptx",
"python-pptx<=1.0",
"tika", # Apache Tika (text & metadata extractor)
"beautifulsoup4",
"markdown",
@@ -191,7 +191,7 @@ colab = [
dev = [
"pre-commit",
# Type check
"mypy",
"mypy==1.10.0",
# Test
"pytest",
"pytest-cov",
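As a quick sanity check (not part of this PR), the tightened pins can be verified at runtime; this sketch assumes the packages are installed and that the packaging library is available.

    from importlib.metadata import version
    from packaging.version import Version

    # Compare installed versions against the bounds introduced above.
    assert Version(version("nltk")) >= Version("3.9")                     # CVE-2024-39705 fix
    assert Version(version("python-pptx")) <= Version("1.0")              # new upper bound
    assert Version(version("sentence-transformers")) >= Version("2.3.1")  # unchanged lower bound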
5 changes: 5 additions & 0 deletions releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml
@@ -0,0 +1,5 @@
fixes:
- Upgrades nltk to 3.9.1, as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705.
upgrade:
- |
Upgrades nltk to 3.9.1, as prior versions are affected by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Because of this security fix, custom NLTK tokenizer models can no longer be used with the new version (for example, in PreProcessor). Users can still use the built-in NLTK tokenizers by specifying the language parameter in the PreProcessor. See the PreProcessor documentation for more details.
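A hedged migration sketch for the note above, assuming the haystack 1.x PreProcessor signature; the parameter values are illustrative. Instead of pointing tokenizer_model_folder at a custom model, select a built-in tokenizer via language:

    from haystack.nodes import PreProcessor

    preprocessor = PreProcessor(
        split_by="word",
        split_length=200,
        split_respect_sentence_boundary=True,
        language="de",  # built-in German sentence tokenizer replaces tokenizer_model_folder
    )
    docs = preprocessor.process([{"content": "Erster Satz. Zweiter Satz."}])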
1 change: 1 addition & 0 deletions test/nodes/test_preprocessor.py
@@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_length

@pytest.mark.unit
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9; this custom punkt model can no longer be loaded")
def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path):
split_length, expected_documents_count = split_length_and_results
