Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: 1.x - nltk upgrade, use nltk.download('punkt_tab') #8256

Merged
merged 15 commits into from
Aug 29, 2024
16 changes: 12 additions & 4 deletions haystack/nodes/preprocessor/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

with LazyImport("Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk'") as nltk_import:
import nltk
from nltk.tokenize.punkt import PunktTokenizer

iso639_to_nltk = {
"ru": "russian",
Expand Down Expand Up @@ -120,10 +121,18 @@ def __init__(
nltk.data.find("tokenizers/punkt")
except LookupError:
try:
nltk.download("punkt")
nltk.download("punkt_tab")
except FileExistsError as error:
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
pass

if tokenizer_model_folder is not None:
warnings.warn(
"Custom NLTK tokenizers are no longer allowed. "
"The 'tokenizer_model_folder' parameter will be ignored. "
"Please use the built-in nltk tokenizers instead by specifying the `language` parameter."
)
self.tokenizer_model_folder = None
self.clean_whitespace = clean_whitespace
self.clean_header_footer = clean_header_footer
self.clean_empty_lines = clean_empty_lines
Expand All @@ -134,7 +143,6 @@ def __init__(
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.tokenizer = tokenizer
self.language = language
self.tokenizer_model_folder = tokenizer_model_folder
self.print_log: Set[str] = set()
self.id_hash_keys = id_hash_keys
self.progress_bar = progress_bar
Expand Down Expand Up @@ -922,14 +930,14 @@ def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokeni

# Use a default NLTK model
elif language_name is not None:
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
sentence_tokenizer = PunktTokenizer(language_name)
else:
logger.error(
"PreProcessor couldn't find the default sentence tokenizer model for %s. "
" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
self.language,
)
sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
sentence_tokenizer = PunktTokenizer() # default english model

return sentence_tokenizer

Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ dependencies = [
[project.optional-dependencies]
inference = [
"transformers[torch,sentencepiece]==4.39.3",
"sentence-transformers>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"huggingface-hub>=0.5.0",
]
elasticsearch = [
Expand Down Expand Up @@ -147,13 +147,13 @@ crawler = [
"selenium>=4.11.0"
]
preprocessing = [
"nltk",
"nltk>=3.9.1",
"langdetect", # for language classification
]
file-conversion = [
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table exctrator)
"python-docx",
"python-pptx",
"python-pptx<=1.0",
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
"tika", # Apache Tika (text & metadata extractor)
"beautifulsoup4",
"markdown",
Expand Down Expand Up @@ -191,7 +191,7 @@ colab = [
dev = [
"pre-commit",
# Type check
"mypy",
"mypy==1.10.0",
# Test
"pytest",
"pytest-cov",
Expand Down
9 changes: 9 additions & 0 deletions releasenotes/notes/upgrade-ntlk-1e94de2d6f5dd3b6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
fixes:
- |
Upgrades ntlk to 3.9.1 as prior versions are affect by https://nvd.nist.gov/vuln/detail/CVE-2024-39705.
upgrade:
- |
Upgrades ntlk to 3.9.1 as prior versions are affect by https://nvd.nist.gov/vuln/detail/CVE-2024-39705. Due to these security vulnerabilities, it is not possible to use custom NLTK tokenizer models with the new version (for example in PreProcessor). Users can still use built-in nltk tokenizers by specifying the language parameter in the PreProcessor. See PreProcessor documentation for more details.
enhancements:
- |
Pins sentence-transformers<=3.0.0,>=2.3.1 and python-pptx<=1.0 to avoid some minor typing incompatibilities with the newer version of the respective libraries.
1 change: 1 addition & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def test_preprocess_sentence_split_custom_models_non_default_language(split_leng

@pytest.mark.unit
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
@pytest.mark.skip(reason="Skipped after upgrade to nltk 3.9, can't load this model pt anymore")
def test_preprocess_sentence_split_custom_models(split_length_and_results, samples_path):
split_length, expected_documents_count = split_length_and_results

Expand Down
Loading