feat: unify NLTKDocumentSplitter and DocumentSplitter #8617

Merged
30 commits merged into main from unify-DocumentSplitter-NLTKDocumentSplitter on Dec 12, 2024
Changes from 16 commits

Commits (30)
ffbb5f6
wip: initial import
davidsbatista Dec 5, 2024
c467ce1
wip: refactoring
davidsbatista Dec 5, 2024
d5207e1
wip: refactoring tests
davidsbatista Dec 6, 2024
a6f4c26
wip: refactoring tests
davidsbatista Dec 7, 2024
a17be34
making all NLTKSplitter related tests work
davidsbatista Dec 9, 2024
b32657f
refactoring
davidsbatista Dec 9, 2024
5b30672
docstrings
davidsbatista Dec 9, 2024
1d21e62
refactoring and removing NLTKDocumentSplitter
davidsbatista Dec 9, 2024
4238b63
fixing tests for custom sentence tokenizer
davidsbatista Dec 9, 2024
3cdc2df
fixing tests for custom sentence tokenizer
davidsbatista Dec 9, 2024
d834b39
cleaning up
davidsbatista Dec 9, 2024
3769013
adding release notes
davidsbatista Dec 9, 2024
fdf8f92
Merge branch 'main' into unify-DocumentSplitter-NLTKDocumentSplitter
davidsbatista Dec 9, 2024
0ee7395
reverting some changes
davidsbatista Dec 9, 2024
9080743
Merge branch 'main' into unify-DocumentSplitter-NLTKDocumentSplitter
davidsbatista Dec 9, 2024
9d682f5
cleaning up tests
davidsbatista Dec 10, 2024
09e67fa
fixing serialisation and adding tests
davidsbatista Dec 11, 2024
5653dba
cleaning up
davidsbatista Dec 11, 2024
0802774
wip
davidsbatista Dec 12, 2024
f1745c7
renaming and cleaning
davidsbatista Dec 12, 2024
06803d9
adding NLTK files
davidsbatista Dec 12, 2024
73a0e68
Merge branch 'main' into unify-DocumentSplitter-NLTKDocumentSplitter
davidsbatista Dec 12, 2024
25ab42e
updating docstring
davidsbatista Dec 12, 2024
aca82e4
adding import to init
davidsbatista Dec 12, 2024
47ab319
Update haystack/components/preprocessors/document_splitter.py
davidsbatista Dec 12, 2024
0ecc817
updating tests
davidsbatista Dec 12, 2024
75952d4
wip
davidsbatista Dec 12, 2024
ec03550
adding sentence/period change warning
davidsbatista Dec 12, 2024
ed47997
fixing LICENSE header
davidsbatista Dec 12, 2024
a82614e
Update haystack/components/preprocessors/document_splitter.py
davidsbatista Dec 12, 2024
3 changes: 1 addition & 2 deletions haystack/components/preprocessors/__init__.py
@@ -4,7 +4,6 @@
 
 from .document_cleaner import DocumentCleaner
 from .document_splitter import DocumentSplitter
-from .nltk_document_splitter import NLTKDocumentSplitter
 from .text_cleaner import TextCleaner
 
-__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner", "NLTKDocumentSplitter"]
+__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner"]
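
For orientation, here is a minimal sketch (not part of the diff) of how downstream imports are affected by this `__init__.py` change:

```python
# Before this PR, NLTKDocumentSplitter was exported from the preprocessors package:
# from haystack.components.preprocessors import NLTKDocumentSplitter

# After this PR, DocumentSplitter is the single splitter component exported here:
from haystack.components.preprocessors import DocumentSplitter
```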
256 changes: 224 additions & 32 deletions haystack/components/preprocessors/document_splitter.py

Large diffs are not rendered by default.

279 changes: 0 additions & 279 deletions haystack/components/preprocessors/nltk_document_splitter.py

This file was deleted.

19 changes: 12 additions & 7 deletions haystack/components/preprocessors/sentence_tokenizer.py
@@ -186,11 +186,16 @@ def _needs_join(
         """
         Checks if the spans need to be joined as parts of one sentence.
 
+        This method determines whether two adjacent sentence spans should be joined back together as a single sentence.
+        It's used to prevent incorrect sentence splitting in specific cases like quotations, numbered lists,
+        and parenthetical expressions.
+
         :param text: The text containing the spans.
-        :param span: The current sentence span within text.
-        :param next_span: The next sentence span within text.
+        :param span: Tuple of (start, end) positions for the current sentence span.
+        :param next_span: Tuple of (start, end) positions for the next sentence span.
         :param quote_spans: All quoted spans within text.
-        :returns: True if the spans needs to be joined.
+        :returns:
+            True if the spans needs to be joined.
         """
         start, end = span
         next_start, next_end = next_span
@@ -216,16 +221,16 @@
         return re.search(r"^\s*[\(\[]", text[next_start:next_end]) is not None
 
     @staticmethod
-    def _read_abbreviations(language: Language) -> List[str]:
+    def _read_abbreviations(lang: Language) -> List[str]:
         """
         Reads the abbreviations for a given language from the abbreviations file.
 
-        :param language: The language to read the abbreviations for.
+        :param lang: The language to read the abbreviations for.
         :returns: List of abbreviations.
         """
-        abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{language}.txt"
+        abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{lang}.txt"
         if not abbreviations_file.exists():
-            logger.warning("No abbreviations file found for {language}.Using default abbreviations.", language=language)
+            logger.warning("No abbreviations file found for {language}. Using default abbreviations.", language=lang)
             return []
 
         abbreviations = abbreviations_file.read_text().split("\n")
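
To make the joining behaviour described in the new docstring concrete, here is a small self-contained sketch. It is not the actual `SentenceSplitter` implementation; the helper names (`quote_spans`, `needs_join`) and the exact joining conditions are simplified assumptions, with only the parenthetical check mirroring the `re.search(r"^\s*[\(\[]", ...)` line visible in the diff above:

```python
import re
from typing import List, Tuple


def quote_spans(text: str) -> List[Tuple[int, int]]:
    """Return (start, end) positions of double-quoted regions in the text."""
    return [(m.start(), m.end()) for m in re.finditer(r'"[^"]*"', text)]


def needs_join(text: str, span: Tuple[int, int], next_span: Tuple[int, int]) -> bool:
    """Decide whether two adjacent sentence spans should be re-joined (simplified)."""
    # Join if the boundary between the spans falls inside a quoted region,
    # i.e. a naive splitter cut a quotation in half.
    for q_start, q_end in quote_spans(text):
        if q_start < span[1] < q_end and q_start < next_span[0] < q_end:
            return True
    # Join if the next span is only a parenthetical continuation, e.g. "(see above)".
    next_start, next_end = next_span
    return re.search(r"^\s*[\(\[]", text[next_start:next_end]) is not None


text = 'She said "Wait. I am coming." and left.'
spans = [(0, 15), (15, 29), (29, 39)]  # naive sentence spans: the quotation is split in two
print(needs_join(text, spans[0], spans[1]))  # True  -> keep the quoted sentence together
print(needs_join(text, spans[1], spans[2]))  # False -> genuine sentence boundary
```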
New release note file (4 additions):
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    The NLTKDocumentSplitter was merged into the DocumentSplitter. You can now make use of more robust sentence boundary detection by initializing the DocumentSplitter with `split_by="nltk_sentence"`.
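
Based on the release note above, a hedged usage sketch of the unified component. The value `split_by="nltk_sentence"` is taken from this snapshot of the PR ("Changes from 16 commits"); later commits such as "adding sentence/period change warning" suggest the final option names may differ, so treat the parameter values as assumptions:

```python
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

# Assumed parameters: the split_by value comes from the release note in this PR snapshot;
# split_length and split_overlap are standard DocumentSplitter options.
splitter = DocumentSplitter(split_by="nltk_sentence", split_length=2, split_overlap=0)

# Assumption: NLTK resources may need to be loaded before running (e.g. via a warm-up step).
docs = [Document(content="Dr. Smith arrived at 9 a.m. He gave a talk. Then he left.")]
result = splitter.run(documents=docs)
print([d.content for d in result["documents"]])
```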