fix: Move potential nltk download to warm_up #8646

Merged 8 commits on Dec 20, 2024
27 changes: 19 additions & 8 deletions haystack/components/preprocessors/document_splitter.py
@@ -107,15 +107,10 @@ def __init__( # pylint: disable=too-many-positional-arguments
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )

-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None

         if split_by == "sentence":
             # ToDo: remove this warning in the next major release
@@ -164,6 +159,18 @@ def _init_checks(
         )
         self.respect_sentence_boundary = False

+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
@@ -182,6 +189,10 @@ def run(self, documents: List[Document]):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")
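The change follows the component lifecycle Haystack uses elsewhere: __init__ only records configuration, warm_up() performs the potentially network-bound setup (here, loading the NLTK sentence tokenizer, which may trigger a download), and run() fails fast if warm-up was skipped. A compact sketch of the pattern, with hypothetical names standing in for the real tokenizer loading (this is not the actual Haystack code):

from typing import Callable, List, Optional

def _load_tokenizer(language: str) -> Callable[[str], List[str]]:
    # Stand-in for the expensive step (in the real component: an NLTK data
    # download plus SentenceSplitter construction).
    return lambda text: [s for s in text.split(". ") if s]

class LazySplitter:
    """Sketch of the __init__/warm_up/run split used in this PR (hypothetical names)."""

    def __init__(self, language: str = "en"):
        # __init__ only stores configuration; nothing here touches the network.
        self.language = language
        self._tokenizer: Optional[Callable[[str], List[str]]] = None

    def warm_up(self) -> None:
        # Idempotent: a second call does not re-create the tokenizer.
        if self._tokenizer is None:
            self._tokenizer = _load_tokenizer(self.language)

    def run(self, text: str) -> List[str]:
        if self._tokenizer is None:
            raise RuntimeError("The component wasn't warmed up. Run 'warm_up()' before calling 'run()'.")
        return self._tokenizer(text)

Keeping warm_up() idempotent matters because a component may be warmed up more than once (for example, by several pipelines sharing it); the `is None` guard makes repeat calls free.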
32 changes: 23 additions & 9 deletions haystack/components/preprocessors/nltk_document_splitter.py
@@ -77,16 +77,23 @@ def __init__( # pylint: disable=too-many-positional-arguments
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
+        self.language = language

+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.
@@ -106,6 +113,7 @@ def _split_into_units(
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":
@@ -142,6 +150,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_
                 break
         return num_sentences_to_keep

+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -258,7 +272,7 @@ def _concatenate_sentences_based_on_word_amount(
             split_start_indices.append(chunk_start_idx)

             # Get the number of sentences that overlap with the next chunk
-            num_sentences_to_keep = self._number_of_sentences_to_keep(
+            num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                 sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
            )
             # Set up information for the new chunk
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download of DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
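For users calling the component directly, here is a minimal usage sketch of the post-merge behavior (assuming Haystack 2.x; the sample text and parameters are illustrative):

from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

splitter = DocumentSplitter(split_by="sentence", split_length=1)
splitter.warm_up()  # loads the sentence tokenizer; may download NLTK data on first use
result = splitter.run(documents=[Document(content="First sentence. Second sentence.")])
print([doc.content for doc in result["documents"]])

Omitting warm_up() here raises the RuntimeError added in this PR. Pipeline users should be unaffected, since the pipeline calls warm_up() on components that define it before execution, which is why only standalone usage changes.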
34 changes: 34 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
@@ -44,16 +44,19 @@ def test_non_text_document(self):
ValueError, match="DocumentSplitter only works with text documents but content for document ID"
):
splitter = DocumentSplitter()
+            splitter.warm_up()
splitter.run(documents=[Document()])
assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

def test_single_doc(self):
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
splitter = DocumentSplitter()
+            splitter.warm_up()
splitter.run(documents=Document())

def test_empty_list(self):
splitter = DocumentSplitter()
+        splitter.warm_up()
res = splitter.run(documents=[])
assert res == {"documents": []}

@@ -76,6 +79,7 @@ def test_unsupported_split_overlap(self):
def test_split_by_word(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 2
@@ -88,6 +92,7 @@ def test_split_by_word(self):

def test_split_by_word_with_threshold(self):
splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
result = splitter.run(
documents=[
Document(
@@ -105,6 +110,7 @@ def test_split_by_word_multiple_input_docs(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
docs = result["documents"]
assert len(docs) == 5
@@ -132,6 +138,7 @@ def test_split_by_word_multiple_input_docs(self):
def test_split_by_period(self):
splitter = DocumentSplitter(split_by="period", split_length=1)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -148,6 +155,7 @@ def test_split_by_period(self):
def test_split_by_passage(self):
splitter = DocumentSplitter(split_by="passage", split_length=1)
text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -164,6 +172,7 @@ def test_split_by_passage(self):
def test_split_by_page(self):
splitter = DocumentSplitter(split_by="page", split_length=1)
text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -183,6 +192,7 @@ def test_split_by_page(self):
def test_split_by_function(self):
splitting_function = lambda s: s.split(".")
splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
text = "This.Is.A.Test"
result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
docs = result["documents"]
@@ -200,6 +210,7 @@ def test_split_by_function(self):
splitting_function = lambda s: re.split(r"[\s]{2,}", s)
splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
text = "This Is\n A Test"
+        splitter.warm_up()
result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
docs = result["documents"]
assert len(docs) == 4
@@ -215,6 +226,7 @@ def test_split_by_function(self):
def test_split_by_word_with_overlap(self):
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 2
@@ -234,6 +246,7 @@ def test_split_by_word_with_overlap(self):
def test_split_by_line(self):
splitter = DocumentSplitter(split_by="line", split_length=1)
text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]

@@ -252,6 +265,7 @@ def test_source_id_stored_in_metadata(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
doc1 = Document(content="This is a text with some words.")
doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])
assert result["documents"][0].meta["source_id"] == doc1.id
assert result["documents"][1].meta["source_id"] == doc2.id
@@ -262,6 +276,7 @@ def test_copy_metadata(self):
Document(content="Text.", meta={"name": "doc 0"}),
Document(content="Text.", meta={"name": "doc 1"}),
]
+        splitter.warm_up()
result = splitter.run(documents=documents)
assert len(result["documents"]) == 2
assert result["documents"][0].id != result["documents"][1].id
@@ -273,6 +288,7 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=2)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
@@ -283,6 +299,7 @@ def test_add_page_number_to_metadata_with_no_overlap_period_split(self):
splitter = DocumentSplitter(split_by="period", split_length=1)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 1]
@@ -294,6 +311,7 @@ def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
+        splitter.warm_up()
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2, 2]
@@ -305,6 +323,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
+        splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]
for doc, p in zip(result["documents"], expected_pages):
@@ -314,6 +333,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
+        splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 3]

@@ -324,6 +344,7 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
doc1 = Document(content="This is some text. And\f this text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
@@ -334,6 +355,7 @@ def test_add_page_number_to_metadata_with_overlap_period_split(self):
splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 1, 1]
@@ -345,6 +367,7 @@ def test_add_page_number_to_metadata_with_overlap_passage_split(self):
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
+        splitter.warm_up()
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2]
@@ -356,6 +379,7 @@ def test_add_page_number_to_metadata_with_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
+        splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ def test_add_split_overlap_information(self):
splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
text = "This is a text with some words. There is a second sentence. And a third sentence."
doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
docs = splitter.run(documents=[doc])["documents"]

# check split_overlap is added to all the documents
@@ -487,6 +512,7 @@ def test_run_empty_document(self):
"""
splitter = DocumentSplitter()
doc = Document(content="")
+        splitter.warm_up()
results = splitter.run([doc])
assert results["documents"] == []

@@ -496,6 +522,7 @@ def test_run_document_only_whitespaces(self):
"""
splitter = DocumentSplitter()
doc = Document(content=" ")
+        splitter.warm_up()
results = splitter.run([doc])
assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ def test_run_split_by_sentence_1(self) -> None:
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
"The moon was full."
)
+        document_splitter.warm_up()
documents = document_splitter.run(documents=[Document(content=text)])["documents"]

assert len(documents) == 2
@@ -568,6 +596,7 @@ def test_run_split_by_sentence_2(self) -> None:
"This is another test sentence. (This is a third test sentence.) "
"This is the last test sentence."
)
+        document_splitter.warm_up()
documents = document_splitter.run(documents=[Document(content=text)])["documents"]

assert len(documents) == 4
@@ -601,6 +630,7 @@ def test_run_split_by_sentence_3(self) -> None:
use_split_rules=True,
extend_abbreviations=True,
)
+        document_splitter.warm_up()

text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -633,6 +663,7 @@ def test_run_split_by_sentence_4(self) -> None:
use_split_rules=True,
extend_abbreviations=True,
)
+        document_splitter.warm_up()

text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -660,6 +691,7 @@ def test_run_split_by_word_respect_sentence_boundary(self) -> None:
language="en",
respect_sentence_boundary=True,
)
+        document_splitter.warm_up()

text = (
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"
@@ -692,6 +724,7 @@ def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None:
use_split_rules=False,
extend_abbreviations=False,
)
+        document_splitter.warm_up()
text = (
"This is a test sentence with many many words that exceeds the split length and should not be repeated. "
"This is another test sentence. (This is a third test sentence.) "
@@ -717,6 +750,7 @@ def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page
extend_abbreviations=True,
respect_sentence_boundary=True,
)
+        document_splitter.warm_up()

text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"