From d49463ba0037c76928c9b687bd21fb86e8e910a2 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:06:19 -0700 Subject: [PATCH 01/11] feat: add justext --- src/datatrove/pipeline/extractors/__init__.py | 1 + src/datatrove/pipeline/extractors/justext.py | 88 +++++++++++++++++++ tests/pipeline/test_extractors.py | 9 +- tests/utils.py | 8 ++ 4 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 src/datatrove/pipeline/extractors/justext.py diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 9620bd2f..262687a5 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,2 +1,3 @@ +from .justext import Justext from .modular import ReadabilityInscriptis from .trafilatura import Trafilatura diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py new file mode 100644 index 00000000..47bd49bc --- /dev/null +++ b/src/datatrove/pipeline/extractors/justext.py @@ -0,0 +1,88 @@ +from .base import BaseExtractor + + +class Justext(BaseExtractor): + """Justext extractor, it uses https://github.com/miso-belica/jusText + + We're actually only using the main entry point of justext: the `justext` function. + No specific data structure is exchanged with Justext, only the text is passed and the extracted text is returned. + Alternatively and identically, `justext` could be used through its command line main interface. + + Args: + length_low: the minimum length of a paragraph + length_high: the maximum length of a paragraph + stopwords_low: the minimum stopwords ratio of a paragraph + stopwords_high: the maximum stopwords ratio of a paragraph + max_link_density: the maximum link density of a paragraph + max_heading_distance: the maximum distance between headings of a paragraph + no_headings: whether to remove headings from the extracted text + remove_boilerplate: whether to remove boilerplate from the extracted text + kwargs: any other option will be passed to justext + timeout: the timeout for extraction, per document, in seconds + """ + + name = "⛏ Justext" + _requires_dependencies = ["justext"] + + def __init__( + self, + stoplist: list[str] = None, + length_low: int = 70, + length_high: int = 200, + stopwords_low: float = 0.3, + stopwords_high: float = 0.32, + max_link_density: float = 0.2, + max_heading_distance: int = 200, + no_headings: bool = False, + remove_boilerplate: bool = True, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + if stoplist is None: + stoplist = self.get_stoplist(lang="english") + self.stoplist = frozenset(stoplist) + self.length_low = length_low + self.length_high = length_high + self.stopwords_low = stopwords_low + self.stopwords_high = stopwords_high + self.max_link_density = max_link_density + self.max_heading_distance = max_heading_distance + self.no_headings = no_headings + self.remove_boilerplate = remove_boilerplate + self.kwargs = kwargs + + @staticmethod + def get_stoplist(lang: str = "english") -> list[str]: + from justext import get_stoplist + + return get_stoplist(lang) + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from justext import justext + + paragraphs = justext( + text, + stoplist=self.stoplist, + length_low=self.length_low, + length_high=self.length_high, + stopwords_low=self.stopwords_low, + stopwords_high=self.stopwords_high, + max_link_density=self.max_link_density, + 
max_heading_distance=self.max_heading_distance, + no_headings=self.no_headings, + **self.kwargs, + ) + + # Join text blocks with double newlines to separate them + if self.remove_boilerplate: + return "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate]) + else: + return "\n\n".join([p.text for p in paragraphs]) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index ec7e1417..bdfe7b17 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,8 @@ import unittest -from datatrove.pipeline.extractors import ReadabilityInscriptis, Trafilatura +from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Trafilatura -from ..utils import require_inscriptis, require_readability, require_trafilatura +from ..utils import require_inscriptis, require_justext, require_readability, require_trafilatura ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -19,3 +19,8 @@ def test_basic_article_trafilatura(self): def test_basic_article_readability(self): extractor = ReadabilityInscriptis(min_text_length=10, min_text_score=1) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_justext + def test_basic_article_justext(self): + extractor = Justext(remove_boilerplate=False) + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") diff --git a/tests/utils.py b/tests/utils.py index 3d076308..ddae9d85 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -55,6 +55,14 @@ def require_trafilatura(test_case): return test_case +def require_justext(test_case): + try: + import justext # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires justext")(test_case) + return test_case + + def require_readability(test_case): try: import readability # noqa: F401 From 9fbc6b23791a64b506931f637f0eaa174b945149 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:13:42 -0700 Subject: [PATCH 02/11] fix: remove justext cli comment --- src/datatrove/pipeline/extractors/justext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py index 47bd49bc..616e9e60 100644 --- a/src/datatrove/pipeline/extractors/justext.py +++ b/src/datatrove/pipeline/extractors/justext.py @@ -6,7 +6,6 @@ class Justext(BaseExtractor): We're actually only using the main entry point of justext: the `justext` function. No specific data structure is exchanged with Justext, only the text is passed and the extracted text is returned. - Alternatively and identically, `justext` could be used through its command line main interface. Args: length_low: the minimum length of a paragraph From a6cce5d2893bf88dfd20be2795cee00d28a3875c Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:41:27 -0700 Subject: [PATCH 03/11] feat: add resiliparse --- src/datatrove/pipeline/extractors/__init__.py | 1 + .../pipeline/extractors/resiliparse.py | 74 +++++++++++++++++++ tests/pipeline/test_extractors.py | 9 ++- tests/utils.py | 8 ++ 4 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 src/datatrove/pipeline/extractors/resiliparse.py diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 262687a5..72714e50 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,3 +1,4 @@ from .justext import Justext from .modular import ReadabilityInscriptis +from .resiliparse import Resiliparse from .trafilatura import Trafilatura diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py new file mode 100644 index 00000000..74c2a997 --- /dev/null +++ b/src/datatrove/pipeline/extractors/resiliparse.py @@ -0,0 +1,74 @@ +from .base import BaseExtractor + + +class Resiliparse(BaseExtractor): + """ + Resiliparse extractor, it uses https://resiliparse.chatnoir.eu/en/latest/index.html + + We're actually only using the main entry point of resiliparse's text extraction: the `extract_plain_text` function. + No specific data structure is exchanged with Resiliparse, only the text is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + preserve_formatting: whether to preserve the formatting of the text + main_content: whether to extract the main content of the document + list_bullets: whether to extract the bullets of the document + alt_texts: whether to extract the alt texts of the document + links: whether to extract the links of the document + form_fields: whether to extract the form fields of the document + noscript: whether to extract the noscript of the document + comments: whether to extract the comments that are present in the document + skip_elements: whether to skip the elements of the document + """ + + name = "⛏ Resiliparse" + _requires_dependencies = ["resiliparse"] + + def __init__( + self, + preserve_formatting: bool = True, + main_content: bool = True, + list_bullets: bool = True, + alt_texts: bool = False, + links: bool = False, + form_fields: bool = False, + noscript: bool = False, + comments: bool = True, + skip_elements: list = None, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + self.preserve_formatting = preserve_formatting + self.main_content = main_content + self.list_bullets = list_bullets + self.alt_texts = alt_texts + self.links = links + self.form_fields = form_fields + self.noscript = noscript + self.comments = comments + self.skip_elements = skip_elements + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + + """ + from resiliparse.extract.html2text import extract_plain_text + + return extract_plain_text( + text, + preserve_formatting=self.preserve_formatting, + main_content=self.main_content, + list_bullets=self.list_bullets, + alt_texts=self.alt_texts, + links=self.links, + form_fields=self.form_fields, + noscript=self.noscript, + comments=self.comments, + skip_elements=self.skip_elements, + ) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index bdfe7b17..dbf4fb5b 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,8 @@ import unittest -from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Trafilatura +from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Resiliparse, Trafilatura -from ..utils import require_inscriptis, require_justext, require_readability, require_trafilatura +from ..utils import require_inscriptis, require_justext, require_readability, require_resiliparse, require_trafilatura ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -24,3 +24,8 @@ def test_basic_article_readability(self): def test_basic_article_justext(self): extractor = Justext(remove_boilerplate=False) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_resiliparse + def test_basic_article_resiliparse(self): + extractor = Resiliparse() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") diff --git a/tests/utils.py b/tests/utils.py index ddae9d85..07e63a22 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -71,6 +71,14 @@ def require_readability(test_case): return test_case +def require_resiliparse(test_case): + try: + import resiliparse # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires resiliparse")(test_case) + return test_case + + def require_inscriptis(test_case): try: import inscriptis # noqa: F401 From add6807ce3e7764bfee2f3f10c0af435d8d1be85 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:14 -0700 Subject: [PATCH 04/11] feat: add inscriptis --- .../pipeline/extractors/inscriptis.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/datatrove/pipeline/extractors/inscriptis.py diff --git a/src/datatrove/pipeline/extractors/inscriptis.py b/src/datatrove/pipeline/extractors/inscriptis.py new file mode 100644 index 00000000..721b308c --- /dev/null +++ b/src/datatrove/pipeline/extractors/inscriptis.py @@ -0,0 +1,63 @@ +import re + +from .base import BaseExtractor + + +class Inscriptis(BaseExtractor): + """Inscriptis extractor, it uses https://github.com/weblyzard/inscriptis + + We're using the main entry point of inscriptis: the `get_text` function. + No specific data structure is exchanged with Inscriptis, only the HTML is passed and the extracted text is returned. + + Args: + timeout: the timeout for extraction, per document, in seconds + deduplicate_captions: whether to remove duplicate captions + display_links: whether to display link targets + display_anchors: whether to display anchor texts + **kwargs: any other option will be passed to inscriptis + """ + + name = "⛏ Inscriptis" + _requires_dependencies = ["inscriptis"] + + def __init__( + self, + timeout: float = 0.1, + max_new_lines: int = 2, + deduplicate_captions: bool = True, + display_links: bool = False, + display_anchors: bool = True, + **kwargs, + ): + super().__init__(timeout) + self.new_line_chars = "\n" * max_new_lines + self.deduplicate_captions = deduplicate_captions + self.display_links = display_links + self.display_anchors = display_anchors + self.kwargs = kwargs + self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)") + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from inscriptis import get_text + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + text = get_text( + html_content=text, + config=ParserConfig( + css=CSS_PROFILES["strict"], + deduplicate_captions=self.deduplicate_captions, + display_links=self.display_links, + display_anchors=self.display_anchors, + **self.kwargs, + ), + ) + + # remove excessive empty lines + return self.regex_excessive_lines.sub(self.new_line_chars, text).strip() From e3a728552490348cfb381a5553c73550720a7e86 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:22 -0700 Subject: [PATCH 05/11] feat: add readabilipy --- .../pipeline/extractors/readabilipy.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 
src/datatrove/pipeline/extractors/readabilipy.py diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py new file mode 100644 index 00000000..5b1c7861 --- /dev/null +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -0,0 +1,57 @@ +from .base import BaseExtractor + + +class ReadabiliPy(BaseExtractor): + """ReadabiliPy extractor, it uses https://github.com/alan-turing-institute/ReadabiliPy + + We're using the main entry point of ReadabiliPy: the `simple_json_from_html_string` function. + The extracted content is returned as plain text. + + Args: + timeout: the timeout for extraction, per document, in seconds + use_readability: whether to use Mozilla's Readability.js (requires Node.js) + content_digests: whether to include content digests in the output + node_indexes: whether to include node indexes in the output + **kwargs: any other option will be passed to ReadabiliPy + """ + + name = "⛏ ReadabiliPy" + _requires_dependencies = ["readabilipy"] + + def __init__( + self, + timeout: float = 0.1, + use_readability: bool = False, + content_digests: bool = False, + node_indexes: bool = False, + **kwargs, + ): + super().__init__(timeout) + self.use_readability = use_readability + self.content_digests = content_digests + self.node_indexes = node_indexes + self.kwargs = kwargs + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readabilipy import simple_json_from_html_string + + result = simple_json_from_html_string( + text, + use_readability=self.use_readability, + content_digests=self.content_digests, + node_indexes=self.node_indexes, + **self.kwargs, + ) + + content = result.get("plain_text", "") + + if isinstance(content, list): + content = "\n\n".join(block["text"] for block in content) + + return content From 2a6ef15509f309b0c04d63f4dbf7cacb56c6bea3 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:32 -0700 Subject: [PATCH 06/11] feat: add readability --- .../pipeline/extractors/readability.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/datatrove/pipeline/extractors/readability.py diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py new file mode 100644 index 00000000..7ff29381 --- /dev/null +++ b/src/datatrove/pipeline/extractors/readability.py @@ -0,0 +1,58 @@ +from .base import BaseExtractor + + +class Readability(BaseExtractor): + """Readability extractor, it uses https://github.com/buriy/python-readability + + We're using the main entry point of readability-lxml: the `Document` class. + No specific data structure is exchanged with Readability, only the HTML is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + min_text_length: the minimum length of text to consider + retry_length: number of chars to use when searching for body + url: the URL of the page (optional, used for better parsing) + keep_classes: list of classes to keep in the extracted content + **kwargs: any other option will be passed to readability + """ + + name = "⛏ Readability" + _requires_dependencies = ["readability"] + + def __init__( + self, + timeout: float = 0.1, + min_text_length: int = 25, + retry_length: int = 250, + url: str = None, + **kwargs, + ): + super().__init__(timeout) + self.min_text_length = min_text_length + self.retry_length = retry_length + self.url = url + self.kwargs = kwargs + + def extract(self, text: str, postprocessor: BaseExtractor) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readability import Document + + if not postprocessor: + raise ValueError("A postprocessor (extractor) must be provided") + + doc = Document( + text, + min_text_length=self.min_text_length, + retry_length=self.retry_length, + url=self.url, + **self.kwargs, + ) + + cleaned_html = doc.summary() + + return postprocessor.extract(cleaned_html) From 84f1ed4cf13206dc8c9b8e8acff2b5f79dfeb387 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:52 -0700 Subject: [PATCH 07/11] feat: add require_readability to utils --- tests/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index 07e63a22..4d00755a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -87,6 +87,14 @@ def require_inscriptis(test_case): return test_case +def require_readabilipy(test_case): + try: + import readabilipy # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires readabilipy")(test_case) + return test_case + + def require_pyarrow(test_case): try: import pyarrow # noqa: F401 From c085736e8bae20b92f7475bd76292ac51f128b37 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:50:04 -0700 Subject: [PATCH 08/11] feat: add tests --- tests/pipeline/test_extractors.py | 39 +++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index dbf4fb5b..b306e1e8 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,23 @@ import unittest -from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Resiliparse, Trafilatura - -from ..utils import require_inscriptis, require_justext, require_readability, require_resiliparse, require_trafilatura +from datatrove.pipeline.extractors import ( + Inscriptis, + Justext, + ReadabiliPy, + Readability, + ReadabilityInscriptis, + Resiliparse, + Trafilatura, +) + +from ..utils import ( + require_inscriptis, + require_justext, + require_readabilipy, + require_readability, + require_resiliparse, + require_trafilatura, +) ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -16,7 +31,7 @@ def test_basic_article_trafilatura(self): @require_readability @require_inscriptis - def test_basic_article_readability(self): + def test_basic_article_readability_inscriptis(self): extractor = ReadabilityInscriptis(min_text_length=10, min_text_score=1) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") @@ -29,3 +44,19 @@ def test_basic_article_justext(self): def test_basic_article_resiliparse(self): extractor = Resiliparse() self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readabilipy + def test_basic_article_readabilipy(self): + extractor = ReadabiliPy() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_inscriptis + def test_basic_article_inscriptis(self): + extractor = Inscriptis() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readability + def test_basic_article_readability(self): + extractor = Readability(min_text_length=10, min_text_score=1) + postprocessor = Trafilatura() + self.assertEqual(extractor.extract(ARTICLE_HTML, postprocessor=postprocessor), "Hello World!") From ea3a9154cf4b40cc48b7ff7d9dcd46b12531bc12 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:50:17 -0700 Subject: [PATCH 09/11] feat: changed configs & pyproject --- pyproject.toml | 3 +++ src/datatrove/pipeline/extractors/__init__.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index cf226903..b29025ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,9 @@ processing = [ # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", "tldextract", "trafilatura>=1.8.0,<1.12.0", + "justext", + "resiliparse", + "readabilipy", "tokenizers", "ftfy", "fasteners", diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 72714e50..c4ef7a89 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,4 +1,7 @@ +from .inscriptis import Inscriptis from .justext import Justext from .modular import ReadabilityInscriptis +from .readabilipy import ReadabiliPy +from .readability import Readability from .resiliparse import Resiliparse from .trafilatura import Trafilatura From 891850e38d705fcc58ea25dab4895376d837c0d9 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 19:55:38 -0700 Subject: [PATCH 10/11] fix: move postprocessor to init --- src/datatrove/pipeline/extractors/readability.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py index 7ff29381..f993220f 100644 --- a/src/datatrove/pipeline/extractors/readability.py +++ b/src/datatrove/pipeline/extractors/readability.py @@ -4,8 +4,10 @@ class Readability(BaseExtractor): """Readability extractor, it uses https://github.com/buriy/python-readability - We're using the main entry point of readability-lxml: the `Document` class. - No specific data structure is exchanged with Readability, only the HTML is passed and the extracted text is returned. + We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a + cleaned HTML string. 
+ + The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text Args: timeout: the timeout for extraction, per document, in seconds @@ -21,6 +23,7 @@ class Readability(BaseExtractor): def __init__( self, + postprocessor: BaseExtractor, timeout: float = 0.1, min_text_length: int = 25, retry_length: int = 250, @@ -28,12 +31,13 @@ def __init__( **kwargs, ): super().__init__(timeout) + self.postprocessor = postprocessor self.min_text_length = min_text_length self.retry_length = retry_length self.url = url self.kwargs = kwargs - def extract(self, text: str, postprocessor: BaseExtractor) -> str: + def extract(self, text: str) -> str: """ Args: text: str: html content @@ -42,9 +46,6 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str: """ from readability import Document - if not postprocessor: - raise ValueError("A postprocessor (extractor) must be provided") - doc = Document( text, min_text_length=self.min_text_length, @@ -55,4 +56,4 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str: cleaned_html = doc.summary() - return postprocessor.extract(cleaned_html) + return self.postprocessor.extract(cleaned_html) From 76816c8a0bb6c153bb906f5f0af3bf22b2956566 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Wed, 11 Dec 2024 04:15:04 +0000 Subject: [PATCH 11/11] feat: implement clean_html method in extractors and update Inscriptis initialization - Added a default `clean_html` method to the `BaseExtractor` class, providing a warning for extractors that do not implement their own. - Implemented specific `clean_html` methods in `Inscriptis`, `Justext`, `ReadabiliPy`, `Readability`, and `Trafilatura` extractors to handle HTML cleaning. - Updated the `Inscriptis` extractor to accept a preprocessor during initialization. - Modified the `extract` methods in `ReadabiliPy` and `Readability` to utilize the new `clean_html` method. - Adjusted the `Justext` extractor to remove the default English language parameter from `get_stoplist`. - Updated tests to reflect changes in extractor initialization and functionality. 
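A minimal sketch of how the reworked pieces compose (class and parameter names are taken from this patch; the sample HTML string is illustrative only):

```python
from datatrove.pipeline.extractors import Inscriptis, Readability, Trafilatura

html = "<html><body><article><p>Hello World!</p></article></body></html>"

# Readability cleans the raw HTML via clean_html(), then hands the cleaned
# markup to the postprocessor supplied at construction time for extraction.
readability = Readability(postprocessor=Trafilatura(), min_text_length=10)
print(readability.extract(html))  # "Hello World!"

# Inscriptis now runs its preprocessor's clean_html() before get_text().
inscriptis = Inscriptis(preprocessor=Trafilatura())
print(inscriptis.extract(html))  # "Hello World!"
```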
--- pyproject.toml | 3 +- src/datatrove/pipeline/extractors/base.py | 15 ++++++++++ .../pipeline/extractors/inscriptis.py | 6 +++- src/datatrove/pipeline/extractors/justext.py | 20 +++++++++++-- .../pipeline/extractors/readabilipy.py | 27 +++++++++++------ .../pipeline/extractors/readability.py | 20 ++++++++----- .../pipeline/extractors/trafilatura.py | 15 ++++++++++ src/datatrove/pipeline/tokens/tokenizer.py | 29 ++++++++++++++++++- tests/pipeline/test_extractors.py | 7 +++-- 9 files changed, 117 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b29025ac..270b82cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,9 +50,10 @@ s3 = [ ] processing = [ "fasttext-wheel", + "brotlipy", "nltk", "inscriptis", -# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", + "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", "tldextract", "trafilatura>=1.8.0,<1.12.0", "justext", diff --git a/src/datatrove/pipeline/extractors/base.py b/src/datatrove/pipeline/extractors/base.py index d3622d79..d83eba21 100644 --- a/src/datatrove/pipeline/extractors/base.py +++ b/src/datatrove/pipeline/extractors/base.py @@ -34,6 +34,21 @@ def extract(self, text: str) -> str: """ pass + def clean_html(self, html: str) -> str: + """Default implementation of `clean_html` for extractors that don't return a cleaned version of the HTML + + Since not all extractors produce a cleaned version of the HTML as a part of the extraction process, + this default implementation throws a warning and simply returns the original HTML string. + + Args: + html: str: the HTML content to clean + + Returns: + str: the cleaned HTML + """ + logger.warning(f"{self.name} doesn't have a clean_html() method by default. Skipping...") + return html + def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> DocumentsPipeline: """Iterates through each document in data and calls `timeout_extract` on it. 
diff --git a/src/datatrove/pipeline/extractors/inscriptis.py b/src/datatrove/pipeline/extractors/inscriptis.py index 721b308c..0e3e2c6f 100644 --- a/src/datatrove/pipeline/extractors/inscriptis.py +++ b/src/datatrove/pipeline/extractors/inscriptis.py @@ -22,6 +22,7 @@ class Inscriptis(BaseExtractor): def __init__( self, + preprocessor: BaseExtractor, timeout: float = 0.1, max_new_lines: int = 2, deduplicate_captions: bool = True, @@ -30,6 +31,7 @@ def __init__( **kwargs, ): super().__init__(timeout) + self.preprocessor = preprocessor self.new_line_chars = "\n" * max_new_lines self.deduplicate_captions = deduplicate_captions self.display_links = display_links @@ -48,8 +50,10 @@ def extract(self, text: str) -> str: from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig + cleaned_html = self.preprocessor.clean_html(text) + text = get_text( - html_content=text, + html_content=cleaned_html, config=ParserConfig( css=CSS_PROFILES["strict"], deduplicate_captions=self.deduplicate_captions, diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py index 616e9e60..05e71914 100644 --- a/src/datatrove/pipeline/extractors/justext.py +++ b/src/datatrove/pipeline/extractors/justext.py @@ -39,7 +39,7 @@ def __init__( ): super().__init__(timeout) if stoplist is None: - stoplist = self.get_stoplist(lang="english") + stoplist = self.get_stoplist() self.stoplist = frozenset(stoplist) self.length_low = length_low self.length_high = length_high @@ -52,10 +52,26 @@ def __init__( self.kwargs = kwargs @staticmethod - def get_stoplist(lang: str = "english") -> list[str]: + def get_stoplist(lang: str = "English") -> list[str]: from justext import get_stoplist return get_stoplist(lang) + + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from justext.core import html_to_dom, preprocessor + from lxml.html import tostring + + dom = html_to_dom(html) + dom = preprocessor(dom) + cleaned_html = tostring(dom).decode() + return cleaned_html def extract(self, text: str) -> str: """ diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py index 5b1c7861..c5e345b7 100644 --- a/src/datatrove/pipeline/extractors/readabilipy.py +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -32,6 +32,20 @@ def __init__( self.node_indexes = node_indexes self.kwargs = kwargs + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from readabilipy import simple_tree_from_html_string + + result = simple_tree_from_html_string(html) + return str(result) + + def extract(self, text: str) -> str: """ Args: @@ -39,17 +53,12 @@ def extract(self, text: str) -> str: Returns: plaintext extracted text """ - from readabilipy import simple_json_from_html_string + from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text - result = simple_json_from_html_string( - text, - use_readability=self.use_readability, - content_digests=self.content_digests, - node_indexes=self.node_indexes, - **self.kwargs, - ) + cleaned_html = self.clean_html(text) - content = result.get("plain_text", "") + pl_content = plain_content(cleaned_html, self.content_digests, self.node_indexes) + content = extract_text_blocks_as_plain_text(pl_content) if isinstance(content, list): content = "\n\n".join(block["text"] for block in content) diff --git 
a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py index f993220f..d528a586 100644 --- a/src/datatrove/pipeline/extractors/readability.py +++ b/src/datatrove/pipeline/extractors/readability.py @@ -37,13 +37,7 @@ def __init__( self.url = url self.kwargs = kwargs - def extract(self, text: str) -> str: - """ - Args: - text: str: html content - - Returns: plaintext extracted text - """ + def clean_html(self, text: str) -> str: from readability import Document doc = Document( @@ -54,6 +48,16 @@ def extract(self, text: str) -> str: **self.kwargs, ) - cleaned_html = doc.summary() + return doc.summary() + + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + cleaned_html = self.clean_html(text) return self.postprocessor.extract(cleaned_html) diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py index dbebd62c..d02cecd1 100644 --- a/src/datatrove/pipeline/extractors/trafilatura.py +++ b/src/datatrove/pipeline/extractors/trafilatura.py @@ -34,6 +34,21 @@ def __init__( self.kwargs = kwargs if self.include_images: raise NotImplementedError + + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from trafilatura import bare_extraction + from xml.etree import ElementTree + + html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body'] + cleaned_html = ElementTree.tostring(html_body, encoding = "unicode") + return cleaned_html def extract(self, text: str) -> str: """ diff --git a/src/datatrove/pipeline/tokens/tokenizer.py b/src/datatrove/pipeline/tokens/tokenizer.py index d6040a04..3ae80080 100644 --- a/src/datatrove/pipeline/tokens/tokenizer.py +++ b/src/datatrove/pipeline/tokens/tokenizer.py @@ -283,6 +283,7 @@ def __init__( save_filename: str = None, # if defined, the final output filename will be this tokenizer_name_or_path: str = "gpt2", # tokenizer to use, from HF or a local eos_token: str = "<|endoftext|>", # whether to add the EOS token after each document + r2l_digits: bool = False, # whether to tokenize digits from right to left save_loss_metadata: bool = False, # save the loss information shuffle: bool = True, # whether to shuffle documents in the dataset, batch_size: int = 10000, # batch size for tokenization @@ -305,6 +306,7 @@ def __init__( self.save_filename = save_filename self.tokenizer_name_or_path = tokenizer_name_or_path self.eos_token = eos_token + self.r2l_digits = r2l_digits self.save_loss_metadata = save_loss_metadata self.shuffle = shuffle self.batch_size = batch_size @@ -343,8 +345,29 @@ def write_unshuffled(self, data: DocumentsPipeline, filename: str): data (DocumentsPipeline): the documents to process filename (str): the filename to use for the output file """ + import re + from tokenizers import Encoding + R2L_COMMA_TOKEN = "" + + self.tokenizer.add_special_tokens([R2L_COMMA_TOKEN]) + R2L_COMMA_TOKEN_ID = self.tokenizer.token_to_id(R2L_COMMA_TOKEN) + + def add_commas(match): + import sys + + sys.set_int_max_str_digits(1000000) + + num, decimal = match.groups() + decimal = decimal if decimal else "" + return f"{int(num):,}{decimal}".replace(",", R2L_COMMA_TOKEN) + + def preprocess_text(text: str): + number_regex = re.compile(r"([\d]+)(\.\d+)?") + processed_text = number_regex.sub(add_commas, text) + return processed_text + unshuff = TokenizedFile( self.output_folder if not self.shuffle or not 
self.local_working_dir else self.local_working_dir, filename, @@ -358,9 +381,13 @@ def write_unshuffled(self, data: DocumentsPipeline, filename: str): # tokenize document's text in batches to go faster – we compute loss values independently if needed for batch in batched(data, self.batch_size): with self.track_time(unit="batch"): - encoded_batch: list[Encoding] = self.tokenizer.encode_batch([document.text for document in batch]) + encoded_batch: list[Encoding] = self.tokenizer.encode_batch( + [preprocess_text(document.text) if self.r2l_digits else document.text for document in batch] + ) for document, encoded in zip(batch, encoded_batch): tokens = encoded.ids + if self.r2l_digits: + tokens = [tok for tok in tokens if tok != R2L_COMMA_TOKEN_ID] loss_values = self.get_loss_values(document, encoded) if loss_values is not None and len(loss_values) < len(tokens): # crop final section without loss diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index b306e1e8..de9e76dd 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -52,11 +52,12 @@ def test_basic_article_readabilipy(self): @require_inscriptis def test_basic_article_inscriptis(self): - extractor = Inscriptis() + preprocessor = Trafilatura() + extractor = Inscriptis(preprocessor=preprocessor) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") @require_readability def test_basic_article_readability(self): - extractor = Readability(min_text_length=10, min_text_score=1) postprocessor = Trafilatura() - self.assertEqual(extractor.extract(ARTICLE_HTML, postprocessor=postprocessor), "Hello World!") + extractor = Readability(postprocessor=postprocessor, min_text_length=10, min_text_score=1) + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!")
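As a closing usage sketch, one of the new extractors wired into a datatrove pipeline. The reader, writer, paths, and task count below are illustrative assumptions and not part of this patch series:

```python
from datatrove.executor import LocalPipelineExecutor
from datatrove.pipeline.extractors import Justext
from datatrove.pipeline.readers import WarcReader
from datatrove.pipeline.writers import JsonlWriter

executor = LocalPipelineExecutor(
    pipeline=[
        WarcReader("data/warcs/"),         # assumed input location
        Justext(remove_boilerplate=True),  # strip boilerplate paragraphs
        JsonlWriter("output/extracted/"),  # assumed output location
    ],
    tasks=4,
)
executor.run()
```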