diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 86d95f412a..b3e99924a7 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -2,20 +2,21 @@ # # SPDX-License-Identifier: Apache-2.0 +import warnings from copy import deepcopy from typing import Any, Callable, Dict, List, Literal, Optional, Tuple from more_itertools import windowed from haystack import Document, component, logging +from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports from haystack.core.serialization import default_from_dict, default_to_dict from haystack.utils import deserialize_callable, serialize_callable logger = logging.getLogger(__name__) -# Maps the 'split_by' argument to the actual char used to split the Documents. -# 'function' is not in the mapping cause it doesn't split on chars. -_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"} +# mapping of split by character, 'function' and 'sentence' don't split by character +_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"} @component @@ -23,8 +24,7 @@ class DocumentSplitter: """ Splits long documents into smaller chunks. - This is a common preprocessing step during indexing. - It helps Embedders create meaningful semantic representations + This is a common preprocessing step during indexing. It helps Embedders create meaningful semantic representations and prevents exceeding language model context limits. The DocumentSplitter is compatible with the following DocumentStores: @@ -54,18 +54,27 @@ class DocumentSplitter: def __init__( # pylint: disable=too-many-positional-arguments self, - split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word", + split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word", split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, splitting_function: Optional[Callable[[str], List[str]]] = None, + respect_sentence_boundary: bool = False, + language: Language = "en", + use_split_rules: bool = True, + extend_abbreviations: bool = True, ): """ Initialize DocumentSplitter. - :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "), - `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"), - `passage` for splitting by double line breaks ("\\n\\n") or `line` for splitting each line ("\\n"). + :param split_by: The unit for splitting your documents. Choose from: + - `word` for splitting by spaces (" ") + - `period` for splitting by periods (".") + - `page` for splitting by form feed ("\\f") + - `passage` for splitting by double line breaks ("\\n\\n") + - `line` for splitting each line ("\\n") + - `sentence` for splitting by NLTK sentence tokenizer + :param split_length: The maximum number of units in each split. :param split_overlap: The number of overlapping units for each split. :param split_threshold: The minimum number of units per split. If a split has fewer units @@ -73,21 +82,87 @@ def __init__( # pylint: disable=too-many-positional-arguments :param splitting_function: Necessary when `split_by` is set to "function". This is a function which must accept a single `str` as input and return a `list` of `str` as output, representing the chunks after splitting. 
+ :param respect_sentence_boundary: Choose whether to respect sentence boundaries when splitting by "word". + If True, uses NLTK to detect sentence boundaries, ensuring splits occur only between sentences. + :param language: Choose the language for the NLTK tokenizer. The default is English ("en"). + :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`. + :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list + of curated abbreviations, if available. This is currently supported for English ("en") and German ("de"). """ self.split_by = split_by - if split_by not in ["function", "page", "passage", "sentence", "word", "line"]: - raise ValueError("split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'.") + self.split_length = split_length + self.split_overlap = split_overlap + self.split_threshold = split_threshold + self.splitting_function = splitting_function + self.respect_sentence_boundary = respect_sentence_boundary + self.language = language + self.use_split_rules = use_split_rules + self.extend_abbreviations = extend_abbreviations + + self._init_checks( + split_by=split_by, + split_length=split_length, + split_overlap=split_overlap, + splitting_function=splitting_function, + respect_sentence_boundary=respect_sentence_boundary, + ) + + if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"): + nltk_imports.check() + self.sentence_splitter = SentenceSplitter( + language=language, + use_split_rules=use_split_rules, + extend_abbreviations=extend_abbreviations, + keep_white_spaces=True, + ) + + if split_by == "sentence": + # TODO: remove this warning in the next major release + msg = ( + "The `split_by='sentence'` option no longer splits by '.' and now relies on a custom sentence tokenizer " + "based on NLTK. To achieve the previous behaviour, use `split_by='period'`." + ) + warnings.warn(msg) + + def _init_checks( + self, + *, + split_by: str, + split_length: int, + split_overlap: int, + splitting_function: Optional[Callable], + respect_sentence_boundary: bool, + ) -> None: + """ + Validates initialization parameters for DocumentSplitter. + + :param split_by: The unit for splitting documents + :param split_length: The maximum number of units in each split + :param split_overlap: The number of overlapping units for each split + :param splitting_function: Custom function for splitting when split_by="function" + :param respect_sentence_boundary: Whether to respect sentence boundaries when splitting + :raises ValueError: If any parameter is invalid + """ + valid_split_by = ["function", "page", "passage", "period", "word", "line", "sentence"] + if split_by not in valid_split_by: + raise ValueError(f"split_by must be one of {', '.join(valid_split_by)}.") + if split_by == "function" and splitting_function is None: raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.") + if split_length <= 0: raise ValueError("split_length must be greater than 0.") - self.split_length = split_length + if split_overlap < 0: raise ValueError("split_overlap must be greater than or equal to 0.") - self.split_overlap = split_overlap - self.split_threshold = split_threshold - self.splitting_function = splitting_function + + if respect_sentence_boundary and split_by != "word": + logger.warning( + "The 'respect_sentence_boundary' option is only supported for `split_by='word'`. 
" + "The option `respect_sentence_boundary` will be set to `False`." + ) + self.respect_sentence_boundary = False @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): @@ -98,7 +173,6 @@ def run(self, documents: List[Document]): and an overlap of `split_overlap`. :param documents: The documents to split. - :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. @@ -121,39 +195,69 @@ def run(self, documents: List[Document]): if doc.content == "": logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue - split_docs += self._split(doc) + + split_docs += self._split_document(doc) return {"documents": split_docs} - def _split(self, to_split: Document) -> List[Document]: - # We already check this before calling _split but - # we need to make linters happy - if to_split.content is None: - return [] + def _split_document(self, doc: Document) -> List[Document]: + if self.split_by == "sentence" or self.respect_sentence_boundary: + return self._split_by_nltk_sentence(doc) if self.split_by == "function" and self.splitting_function is not None: - splits = self.splitting_function(to_split.content) - docs: List[Document] = [] - for s in splits: - meta = deepcopy(to_split.meta) - meta["source_id"] = to_split.id - docs.append(Document(content=s, meta=meta)) - return docs - - split_at = _SPLIT_BY_MAPPING[self.split_by] - units = to_split.content.split(split_at) + return self._split_by_function(doc) + + return self._split_by_character(doc) + + def _split_by_nltk_sentence(self, doc: Document) -> List[Document]: + split_docs = [] + + result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run() + units = [sentence["sentence"] for sentence in result] + + if self.respect_sentence_boundary: + text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount( + sentences=units, split_length=self.split_length, split_overlap=self.split_overlap + ) + else: + text_splits, splits_pages, splits_start_idxs = self._concatenate_units( + elements=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata + ) + + return split_docs + + def _split_by_character(self, doc) -> List[Document]: + split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by] + units = doc.content.split(split_at) # Add the delimiter back to all units except the last one for i in range(len(units) - 1): units[i] += split_at - text_splits, splits_pages, splits_start_idxs = self._concatenate_units( units, self.split_length, self.split_overlap, self.split_threshold ) - metadata = deepcopy(to_split.meta) - metadata["source_id"] = to_split.id + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id return self._create_docs_from_splits( text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata ) + def _split_by_function(self, doc) -> List[Document]: + # the check for None is done already in the run method + splits = self.splitting_function(doc.content) # type: ignore + docs: List[Document] = [] + for s in splits: + meta = deepcopy(doc.meta) + 
meta["source_id"] = doc.id + docs.append(Document(content=s, meta=meta)) + return docs + def _concatenate_units( self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int ) -> Tuple[List[str], List[int], List[int]]: @@ -265,6 +369,10 @@ def to_dict(self) -> Dict[str, Any]: split_length=self.split_length, split_overlap=self.split_overlap, split_threshold=self.split_threshold, + respect_sentence_boundary=self.respect_sentence_boundary, + language=self.language, + use_split_rules=self.use_split_rules, + extend_abbreviations=self.extend_abbreviations, ) if self.splitting_function: serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function) @@ -282,3 +390,99 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter": init_params["splitting_function"] = deserialize_callable(splitting_function) return default_from_dict(cls, data) + + @staticmethod + def _concatenate_sentences_based_on_word_amount( + sentences: List[str], split_length: int, split_overlap: int + ) -> Tuple[List[str], List[int], List[int]]: + """ + Groups the sentences into chunks of `split_length` words while respecting sentence boundaries. + + This function is only used when splitting by `word` and `respect_sentence_boundary` is set to `True`, i.e.: + with NLTK sentence tokenizer. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices. + """ + # chunk information + chunk_word_count = 0 + chunk_starting_page_number = 1 + chunk_start_idx = 0 + current_chunk: List[str] = [] + # output lists + split_start_page_numbers = [] + list_of_splits: List[List[str]] = [] + split_start_indices = [] + + for sentence_idx, sentence in enumerate(sentences): + current_chunk.append(sentence) + chunk_word_count += len(sentence.split()) + next_sentence_word_count = ( + len(sentences[sentence_idx + 1].split()) if sentence_idx < len(sentences) - 1 else 0 + ) + + # Number of words in the current chunk plus the next sentence is larger than the split_length, + # or we reached the last sentence + if (chunk_word_count + next_sentence_word_count) > split_length or sentence_idx == len(sentences) - 1: + # Save current chunk and start a new one + list_of_splits.append(current_chunk) + split_start_page_numbers.append(chunk_starting_page_number) + split_start_indices.append(chunk_start_idx) + + # Get the number of sentences that overlap with the next chunk + num_sentences_to_keep = DocumentSplitter._number_of_sentences_to_keep( + sentences=current_chunk, split_length=split_length, split_overlap=split_overlap + ) + # Set up information for the new chunk + if num_sentences_to_keep > 0: + # Processed sentences are the ones that are not overlapping with the next chunk + processed_sentences = current_chunk[:-num_sentences_to_keep] + chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences) + chunk_start_idx += len("".join(processed_sentences)) + # Next chunk starts with the sentences that were overlapping with the previous chunk + current_chunk = current_chunk[-num_sentences_to_keep:] + chunk_word_count = sum(len(s.split()) for s in current_chunk) + else: + # Here processed_sentences is the same as current_chunk since there is no overlap + chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk) + chunk_start_idx += 
len("".join(current_chunk)) + current_chunk = [] + chunk_word_count = 0 + + # Concatenate the sentences together within each split + text_splits = [] + for split in list_of_splits: + text = "".join(split) + if len(text) > 0: + text_splits.append(text) + + return text_splits, split_start_page_numbers, split_start_indices + + @staticmethod + def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int) -> int: + """ + Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: The number of sentences to keep in the next chunk. + """ + # If the split_overlap is 0, we don't need to keep any sentences + if split_overlap == 0: + return 0 + + num_sentences_to_keep = 0 + num_words = 0 + # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence + for sent in reversed(sentences[1:]): + num_words += len(sent.split()) + # If the number of words is larger than the split_length then don't add any more sentences + if num_words > split_length: + break + num_sentences_to_keep += 1 + if num_words > split_overlap: + break + return num_sentences_to_keep diff --git a/haystack/components/preprocessors/sentence_tokenizer.py b/haystack/components/preprocessors/sentence_tokenizer.py index 4932513452..505126e901 100644 --- a/haystack/components/preprocessors/sentence_tokenizer.py +++ b/haystack/components/preprocessors/sentence_tokenizer.py @@ -186,11 +186,16 @@ def _needs_join( """ Checks if the spans need to be joined as parts of one sentence. + This method determines whether two adjacent sentence spans should be joined back together as a single sentence. + It's used to prevent incorrect sentence splitting in specific cases like quotations, numbered lists, + and parenthetical expressions. + :param text: The text containing the spans. - :param span: The current sentence span within text. - :param next_span: The next sentence span within text. + :param span: Tuple of (start, end) positions for the current sentence span. + :param next_span: Tuple of (start, end) positions for the next sentence span. :param quote_spans: All quoted spans within text. - :returns: True if the spans needs to be joined. + :returns: + True if the spans needs to be joined. """ start, end = span next_start, next_end = next_span @@ -216,16 +221,16 @@ def _needs_join( return re.search(r"^\s*[\(\[]", text[next_start:next_end]) is not None @staticmethod - def _read_abbreviations(language: Language) -> List[str]: + def _read_abbreviations(lang: Language) -> List[str]: """ Reads the abbreviations for a given language from the abbreviations file. - :param language: The language to read the abbreviations for. + :param lang: The language to read the abbreviations for. :returns: List of abbreviations. """ - abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{language}.txt" + abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{lang}.txt" if not abbreviations_file.exists(): - logger.warning("No abbreviations file found for {language}.Using default abbreviations.", language=language) + logger.warning("No abbreviations file found for {language}. 
Using default abbreviations.", language=lang) return [] abbreviations = abbreviations_file.read_text().split("\n") diff --git a/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml b/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml new file mode 100644 index 0000000000..2b2a9d5f1b --- /dev/null +++ b/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + The `NLTKDocumentSplitter` has been merged into the `DocumentSplitter`, which now provides the same functionality. `split_by="sentence"` now uses custom sentence boundary detection based on the `nltk` library. The previous `sentence` behaviour can still be achieved with `split_by="period"`. diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 25872626c1..78767dbccd 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +from typing import List + import re import pytest @@ -36,7 +38,7 @@ def merge_documents(documents): return merged_text -class TestDocumentSplitter: +class TestSplittingByFunctionOrCharacterRegex: def test_non_text_document(self): with pytest.raises( ValueError, match="DocumentSplitter only works with text documents but content for document ID" @@ -56,11 +58,13 @@ def test_empty_list(self): assert res == {"documents": []} def test_unsupported_split_by(self): - with pytest.raises( - ValueError, match="split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'." - ): + with pytest.raises(ValueError, match="split_by must be one of "): DocumentSplitter(split_by="unsupported") + def test_undefined_function(self): + with pytest.raises(ValueError, match="When 'split_by' is set to 'function', a valid 'splitting_function'"): + DocumentSplitter(split_by="function", splitting_function=None) + def test_unsupported_split_length(self): with pytest.raises(ValueError, match="split_length must be greater than 0."): DocumentSplitter(split_length=0) @@ -125,8 +129,8 @@ def test_split_by_word_multiple_input_docs(self): assert docs[4].meta["split_id"] == 2 assert docs[4].meta["split_idx_start"] == text2.index(docs[4].content) - def test_split_by_sentence(self): - splitter = DocumentSplitter(split_by="sentence", split_length=1) + def test_split_by_period(self): + splitter = DocumentSplitter(split_by="period", split_length=1) text = "This is a text with some words. There is a second sentence. And there is a third sentence."
result = splitter.run(documents=[Document(content=text)]) docs = result["documents"] @@ -275,8 +279,8 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self): - splitter = DocumentSplitter(split_by="sentence", split_length=1) + def test_add_page_number_to_metadata_with_no_overlap_period_split(self): + splitter = DocumentSplitter(split_by="period", split_length=1) doc1 = Document(content="This is some text.\f This text is on another page.") doc2 = Document(content="This content has two.\f\f page brakes.") result = splitter.run(documents=[doc1, doc2]) @@ -326,8 +330,8 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - def test_add_page_number_to_metadata_with_overlap_sentence_split(self): - splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) + def test_add_page_number_to_metadata_with_overlap_period_split(self): + splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1) doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") doc2 = Document(content="This content has two.\f\f page brakes. More text.") result = splitter.run(documents=[doc1, doc2]) @@ -494,3 +498,301 @@ def test_run_document_only_whitespaces(self): doc = Document(content=" ") results = splitter.run([doc]) assert results["documents"][0].content == " " + + +class TestSplittingNLTKSentenceSplitter: + @pytest.mark.parametrize( + "sentences, expected_num_sentences", + [ + (["The sun set.", "Moonlight shimmered softly, wolves howled nearby, night enveloped everything."], 0), + (["The sun set.", "It was a dark night ..."], 0), + (["The sun set.", " The moon was full."], 1), + (["The sun.", " The moon."], 1), # Ignores the first sentence + (["Sun", "Moon"], 1), # Ignores the first sentence even if its inclusion would be < split_overlap + ], + ) + def test_number_of_sentences_to_keep(self, sentences: List[str], expected_num_sentences: int) -> None: + num_sentences = DocumentSplitter._number_of_sentences_to_keep( + sentences=sentences, split_length=5, split_overlap=2 + ) + assert num_sentences == expected_num_sentences + + def test_number_of_sentences_to_keep_split_overlap_zero(self) -> None: + sentences = [ + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.", + " It was a dark night ...", + " The moon was full.", + ] + num_sentences = DocumentSplitter._number_of_sentences_to_keep( + sentences=sentences, split_length=5, split_overlap=0 + ) + assert num_sentences == 0 + + def test_run_split_by_sentence_1(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=2, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = ( + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... " + "The moon was full." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 2 + assert ( + documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped " + "everything. It was a dark night ... " + ) + assert documents[1].content == "The moon was full." 
+ + def test_run_split_by_sentence_2(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=1, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=False, + extend_abbreviations=True, + ) + + text = ( + "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + "This is another test sentence. (This is a third test sentence.) " + "This is the last test sentence." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 4 + assert ( + documents[0].content + == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + ) + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "This is another test sentence. " + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "(This is a third test sentence.) " + assert documents[2].meta["page_number"] == 1 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "This is the last test sentence." + assert documents[3].meta["page_number"] == 1 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + def test_run_split_by_sentence_3(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=1, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 4 + assert documents[0].content == "Sentence on page 1.\f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Sentence on page 2. \f" + assert documents[1].meta["page_number"] == 2 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 3. \f\f " + assert documents[2].meta["page_number"] == 3 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "Sentence on page 5." + assert documents[3].meta["page_number"] == 5 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + def test_run_split_by_sentence_4(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=2, + split_overlap=1, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." 
+ documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 3 + assert documents[0].content == "Sentence on page 1.\fSentence on page 2. \f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Sentence on page 2. \fSentence on page 3. \f\f " + assert documents[1].meta["page_number"] == 2 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 3. \f\f Sentence on page 5." + assert documents[2].meta["page_number"] == 3 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + def test_run_split_by_word_respect_sentence_boundary(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=3, + split_overlap=0, + split_threshold=0, + language="en", + respect_sentence_boundary=True, + ) + + text = ( + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f" + "The moon was full." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 3 + assert documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. " + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "It was a dark night.\f" + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "The moon was full." + assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=13, + split_overlap=3, + split_threshold=0, + language="en", + respect_sentence_boundary=True, + use_split_rules=False, + extend_abbreviations=False, + ) + text = ( + "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + "This is another test sentence. (This is a third test sentence.) " + "This is the last test sentence." + ) + documents = document_splitter.run([Document(content=text)])["documents"] + assert len(documents) == 3 + assert ( + documents[0].content + == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + ) + assert "This is a test sentence with many many words" not in documents[1].content + assert "This is a test sentence with many many words" not in documents[2].content + + def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page_breaks(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=8, + split_overlap=1, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + respect_sentence_boundary=True, + ) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. 
Another on page 3.\f\f Sentence on page 5." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 6 + assert documents[0].content == "Sentence on page 1. Another on page 1.\f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Another on page 1.\fSentence on page 2. " + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 2. Another on page 2.\f" + assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "Another on page 2.\fSentence on page 3. " + assert documents[3].meta["page_number"] == 2 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + assert documents[4].content == "Sentence on page 3. Another on page 3.\f\f " + assert documents[4].meta["page_number"] == 3 + assert documents[4].meta["split_id"] == 4 + assert documents[4].meta["split_idx_start"] == text.index(documents[4].content) + assert documents[5].content == "Another on page 3.\f\f Sentence on page 5." + assert documents[5].meta["page_number"] == 3 + assert documents[5].meta["split_id"] == 5 + assert documents[5].meta["split_idx_start"] == text.index(documents[5].content) + + def test_respect_sentence_boundary_checks(self): + # this combination triggers the warning + splitter = DocumentSplitter(split_by="sentence", split_length=10, respect_sentence_boundary=True) + assert splitter.respect_sentence_boundary == False + + def test_sentence_serialization(self): + """Test serialization with NLTK sentence splitting configuration and using non-default values""" + splitter = DocumentSplitter( + split_by="sentence", + language="de", + use_split_rules=False, + extend_abbreviations=False, + respect_sentence_boundary=False, + ) + serialized = splitter.to_dict() + deserialized = DocumentSplitter.from_dict(serialized) + + assert deserialized.split_by == "sentence" + assert hasattr(deserialized, "sentence_splitter") + assert deserialized.language == "de" + assert deserialized.use_split_rules == False + assert deserialized.extend_abbreviations == False + assert deserialized.respect_sentence_boundary == False + + def test_nltk_serialization_roundtrip(self): + """Test complete serialization roundtrip with actual document splitting""" + splitter = DocumentSplitter( + split_by="sentence", + language="de", + use_split_rules=False, + extend_abbreviations=False, + respect_sentence_boundary=False, + ) + serialized = splitter.to_dict() + deserialized_splitter = DocumentSplitter.from_dict(serialized) + assert splitter.split_by == deserialized_splitter.split_by + + def test_respect_sentence_boundary_serialization(self): + """Test serialization with respect_sentence_boundary option""" + splitter = DocumentSplitter(split_by="word", respect_sentence_boundary=True, language="de") + serialized = splitter.to_dict() + deserialized = DocumentSplitter.from_dict(serialized) + + assert deserialized.respect_sentence_boundary == True + assert hasattr(deserialized, "sentence_splitter") + assert deserialized.language == "de" diff --git 
a/test/components/preprocessors/test_sentence_tokenizer.py b/test/components/preprocessors/test_sentence_tokenizer.py new file mode 100644 index 0000000000..bf9aab9a9e --- /dev/null +++ b/test/components/preprocessors/test_sentence_tokenizer.py @@ -0,0 +1,67 @@ +import pytest +from unittest.mock import patch +from pathlib import Path + +from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter + +from pytest import LogCaptureFixture + + +def test_apply_split_rules_no_join() -> None: + text = "This is a test. This is another test. And a third test." + spans = [(0, 15), (16, 36), (37, 54)] + result = SentenceSplitter._apply_split_rules(text, spans) + assert len(result) == 3 + assert result == [(0, 15), (16, 36), (37, 54)] + + +def test_apply_split_rules_join_case_1(): + text = 'He said "This is sentence one. This is sentence two." Then he left.' + result = SentenceSplitter._apply_split_rules(text, [(0, 30), (31, 53), (54, 67)]) + assert len(result) == 2 + assert result == [(0, 53), (54, 67)] + + +def test_apply_split_rules_join_case_3(): + splitter = SentenceSplitter(language="en", use_split_rules=True) + text = """ + 1. First item + 2. Second item + 3. Third item.""" + spans = [(0, 7), (8, 25), (26, 44), (45, 56)] + result = splitter._apply_split_rules(text, spans) + assert len(result) == 1 + assert result == [(0, 56)] + + +def test_apply_split_rules_join_case_4() -> None: + text = "This is a test. (With a parenthetical statement.) And another sentence." + spans = [(0, 15), (16, 50), (51, 74)] + result = SentenceSplitter._apply_split_rules(text, spans) + assert len(result) == 2 + assert result == [(0, 50), (51, 74)] + + +@pytest.fixture +def mock_file_content(): + return "Mr.\nDr.\nProf." + + +def test_read_abbreviations_existing_file(tmp_path, mock_file_content): + abbrev_dir = tmp_path / "data" / "abbreviations" + abbrev_dir.mkdir(parents=True) + abbrev_file = abbrev_dir / f"en.txt" + abbrev_file.write_text(mock_file_content) + + with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path: + mock_path.return_value.parent.parent = tmp_path + result = SentenceSplitter._read_abbreviations("en") + assert result == ["Mr.", "Dr.", "Prof."] + + +def test_read_abbreviations_missing_file(caplog: LogCaptureFixture): + with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path: + mock_path.return_value.parent.parent = Path("/nonexistent") + result = SentenceSplitter._read_abbreviations("pt") + assert result == [] + assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text diff --git a/test/components/retrievers/test_sentence_window_retriever.py b/test/components/retrievers/test_sentence_window_retriever.py index 4979fa334d..04e03befbe 100644 --- a/test/components/retrievers/test_sentence_window_retriever.py +++ b/test/components/retrievers/test_sentence_window_retriever.py @@ -176,7 +176,7 @@ def test_context_documents_returned_are_ordered_by_split_idx_start(self): @pytest.mark.integration def test_run_with_pipeline(self): - splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by="sentence") + splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by="period") text = ( "This is a text with some words. There is a second sentence. And there is also a third sentence. " "It also contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a seventh sentence"
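A minimal usage sketch of the unified splitter, for trying this change locally. It assumes the standard `haystack.components.preprocessors` import path and that `nltk` is installed (needed for `split_by="sentence"` and `respect_sentence_boundary=True`); the printed chunk boundaries are illustrative, not asserted.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

doc = Document(
    content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything. "
    "It was a dark night ... The moon was full."
)

# New behaviour: NLTK-based sentence boundary detection.
sentence_splitter = DocumentSplitter(split_by="sentence", split_length=2, language="en")

# Previous "sentence" behaviour (naive split on "."): use split_by="period" instead.
period_splitter = DocumentSplitter(split_by="period", split_length=1)

# Word-based splitting that never cuts a sentence in half.
word_splitter = DocumentSplitter(split_by="word", split_length=10, respect_sentence_boundary=True)

for splitter in (sentence_splitter, period_splitter, word_splitter):
    chunks = splitter.run(documents=[doc])["documents"]
    print([chunk.content for chunk in chunks])
```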
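The underlying tokenizer can also be exercised on its own. A sketch of the `SentenceSplitter` API as used by `DocumentSplitter.__init__` above (the constructor arguments mirror the ones passed through there; NLTK's Punkt data must be available):

```python
from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter

splitter = SentenceSplitter(
    language="en", use_split_rules=True, extend_abbreviations=True, keep_white_spaces=True
)
text = 'He said "This is sentence one. This is sentence two." Then he left.'
for span in splitter.split_sentences(text):
    # Each item is a dict whose "sentence" key holds the sentence text,
    # with whitespace preserved because keep_white_spaces=True.
    print(repr(span["sentence"]))
```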
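And a small check of the overlap helper, reusing one of the parametrized cases from the new tests: `_number_of_sentences_to_keep` walks a chunk's sentences backwards, skipping the first one so consecutive chunks never start identically, and accumulates words until `split_overlap` is exceeded (capped by `split_length`). It is a private static method, so this call is purely illustrative.

```python
from haystack.components.preprocessors import DocumentSplitter

sentences = ["The sun set.", " The moon was full."]
kept = DocumentSplitter._number_of_sentences_to_keep(
    sentences=sentences, split_length=5, split_overlap=2
)
print(kept)  # 1 -> " The moon was full." is carried over into the next chunk
```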