From d49463ba0037c76928c9b687bd21fb86e8e910a2 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:06:19 -0700 Subject: [PATCH 01/11] feat: add justext --- src/datatrove/pipeline/extractors/__init__.py | 1 + src/datatrove/pipeline/extractors/justext.py | 88 +++++++++++++++++++ tests/pipeline/test_extractors.py | 9 +- tests/utils.py | 8 ++ 4 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 src/datatrove/pipeline/extractors/justext.py diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 9620bd2f..262687a5 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,2 +1,3 @@ +from .justext import Justext from .modular import ReadabilityInscriptis from .trafilatura import Trafilatura diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py new file mode 100644 index 00000000..47bd49bc --- /dev/null +++ b/src/datatrove/pipeline/extractors/justext.py @@ -0,0 +1,88 @@ +from .base import BaseExtractor + + +class Justext(BaseExtractor): + """Justext extractor, it uses https://github.com/miso-belica/jusText + + We're actually only using the main entry point of justext: the `justext` function. + No specific data structure is exchanged with Justext, only the text is passed and the extracted text is returned. + Alternatively and identically, `justext` could be used through its command line main interface. + + Args: + length_low: the minimum length of a paragraph + length_high: the maximum length of a paragraph + stopwords_low: the minimum stopwords ratio of a paragraph + stopwords_high: the maximum stopwords ratio of a paragraph + max_link_density: the maximum link density of a paragraph + max_heading_distance: the maximum distance between headings of a paragraph + no_headings: whether to remove headings from the extracted text + remove_boilerplate: whether to remove boilerplate from the extracted text + kwargs: any other option will be passed to justext + timeout: the timeout for extraction, per document, in seconds + """ + + name = "⛏ Justext" + _requires_dependencies = ["justext"] + + def __init__( + self, + stoplist: list[str] = None, + length_low: int = 70, + length_high: int = 200, + stopwords_low: float = 0.3, + stopwords_high: float = 0.32, + max_link_density: float = 0.2, + max_heading_distance: int = 200, + no_headings: bool = False, + remove_boilerplate: bool = True, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + if stoplist is None: + stoplist = self.get_stoplist(lang="english") + self.stoplist = frozenset(stoplist) + self.length_low = length_low + self.length_high = length_high + self.stopwords_low = stopwords_low + self.stopwords_high = stopwords_high + self.max_link_density = max_link_density + self.max_heading_distance = max_heading_distance + self.no_headings = no_headings + self.remove_boilerplate = remove_boilerplate + self.kwargs = kwargs + + @staticmethod + def get_stoplist(lang: str = "english") -> list[str]: + from justext import get_stoplist + + return get_stoplist(lang) + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from justext import justext + + paragraphs = justext( + text, + stoplist=self.stoplist, + length_low=self.length_low, + length_high=self.length_high, + stopwords_low=self.stopwords_low, + stopwords_high=self.stopwords_high, + max_link_density=self.max_link_density, + 
max_heading_distance=self.max_heading_distance, + no_headings=self.no_headings, + **self.kwargs, + ) + + # Join text blocks with double newlines to separate them + if self.remove_boilerplate: + return "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate]) + else: + return "\n\n".join([p.text for p in paragraphs]) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index ec7e1417..bdfe7b17 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,8 @@ import unittest -from datatrove.pipeline.extractors import ReadabilityInscriptis, Trafilatura +from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Trafilatura -from ..utils import require_inscriptis, require_readability, require_trafilatura +from ..utils import require_inscriptis, require_justext, require_readability, require_trafilatura ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -19,3 +19,8 @@ def test_basic_article_trafilatura(self): def test_basic_article_readability(self): extractor = ReadabilityInscriptis(min_text_length=10, min_text_score=1) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_justext + def test_basic_article_justext(self): + extractor = Justext(remove_boilerplate=False) + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") diff --git a/tests/utils.py b/tests/utils.py index 3d076308..ddae9d85 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -55,6 +55,14 @@ def require_trafilatura(test_case): return test_case +def require_justext(test_case): + try: + import justext # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires justext")(test_case) + return test_case + + def require_readability(test_case): try: import readability # noqa: F401 From 9fbc6b23791a64b506931f637f0eaa174b945149 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:13:42 -0700 Subject: [PATCH 02/11] fix: remove justext cli comment --- src/datatrove/pipeline/extractors/justext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py index 47bd49bc..616e9e60 100644 --- a/src/datatrove/pipeline/extractors/justext.py +++ b/src/datatrove/pipeline/extractors/justext.py @@ -6,7 +6,6 @@ class Justext(BaseExtractor): We're actually only using the main entry point of justext: the `justext` function. No specific data structure is exchanged with Justext, only the text is passed and the extracted text is returned. - Alternatively and identically, `justext` could be used through its command line main interface. Args: length_low: the minimum length of a paragraph From a6cce5d2893bf88dfd20be2795cee00d28a3875c Mon Sep 17 00:00:00 2001 From: garrethlee Date: Tue, 24 Sep 2024 17:41:27 -0700 Subject: [PATCH 03/11] feat: add resiliparse --- src/datatrove/pipeline/extractors/__init__.py | 1 + .../pipeline/extractors/resiliparse.py | 74 +++++++++++++++++++ tests/pipeline/test_extractors.py | 9 ++- tests/utils.py | 8 ++ 4 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 src/datatrove/pipeline/extractors/resiliparse.py diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 262687a5..72714e50 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,3 +1,4 @@ from .justext import Justext from .modular import ReadabilityInscriptis +from .resiliparse import Resiliparse from .trafilatura import Trafilatura diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py new file mode 100644 index 00000000..74c2a997 --- /dev/null +++ b/src/datatrove/pipeline/extractors/resiliparse.py @@ -0,0 +1,74 @@ +from .base import BaseExtractor + + +class Resiliparse(BaseExtractor): + """ + Resiliparse extractor, it uses https://resiliparse.chatnoir.eu/en/latest/index.html + + We're actually only using the main entry point of resiliparse's text extraction: the `extract_plain_text` function. + No specific data structure is exchanged with Resiliparse, only the text is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + preserve_formatting: whether to preserve the formatting of the text + main_content: whether to extract the main content of the document + list_bullets: whether to extract the bullets of the document + alt_texts: whether to extract the alt texts of the document + links: whether to extract the links of the document + form_fields: whether to extract the form fields of the document + noscript: whether to extract the noscript of the document + comments: whether to extract the comments that are present in the document + skip_elements: whether to skip the elements of the document + """ + + name = "⛏ Resiliparse" + _requires_dependencies = ["resiliparse"] + + def __init__( + self, + preserve_formatting: bool = True, + main_content: bool = True, + list_bullets: bool = True, + alt_texts: bool = False, + links: bool = False, + form_fields: bool = False, + noscript: bool = False, + comments: bool = True, + skip_elements: list = None, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + self.preserve_formatting = preserve_formatting + self.main_content = main_content + self.list_bullets = list_bullets + self.alt_texts = alt_texts + self.links = links + self.form_fields = form_fields + self.noscript = noscript + self.comments = comments + self.skip_elements = skip_elements + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + + """ + from resiliparse.extract.html2text import extract_plain_text + + return extract_plain_text( + text, + preserve_formatting=self.preserve_formatting, + main_content=self.main_content, + list_bullets=self.list_bullets, + alt_texts=self.alt_texts, + links=self.links, + form_fields=self.form_fields, + noscript=self.noscript, + comments=self.comments, + skip_elements=self.skip_elements, + ) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index bdfe7b17..dbf4fb5b 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,8 @@ import unittest -from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Trafilatura +from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Resiliparse, Trafilatura -from ..utils import require_inscriptis, require_justext, require_readability, require_trafilatura +from ..utils import require_inscriptis, require_justext, require_readability, require_resiliparse, require_trafilatura ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -24,3 +24,8 @@ def test_basic_article_readability(self): def test_basic_article_justext(self): extractor = Justext(remove_boilerplate=False) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_resiliparse + def test_basic_article_resiliparse(self): + extractor = Resiliparse() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") diff --git a/tests/utils.py b/tests/utils.py index ddae9d85..07e63a22 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -71,6 +71,14 @@ def require_readability(test_case): return test_case +def require_resiliparse(test_case): + try: + import resiliparse # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires resiliparse")(test_case) + return test_case + + def require_inscriptis(test_case): try: import inscriptis # noqa: F401 From add6807ce3e7764bfee2f3f10c0af435d8d1be85 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:14 -0700 Subject: [PATCH 04/11] feat: add inscriptis --- .../pipeline/extractors/inscriptis.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/datatrove/pipeline/extractors/inscriptis.py diff --git a/src/datatrove/pipeline/extractors/inscriptis.py b/src/datatrove/pipeline/extractors/inscriptis.py new file mode 100644 index 00000000..721b308c --- /dev/null +++ b/src/datatrove/pipeline/extractors/inscriptis.py @@ -0,0 +1,63 @@ +import re + +from .base import BaseExtractor + + +class Inscriptis(BaseExtractor): + """Inscriptis extractor, it uses https://github.com/weblyzard/inscriptis + + We're using the main entry point of inscriptis: the `get_text` function. + No specific data structure is exchanged with Inscriptis, only the HTML is passed and the extracted text is returned. + + Args: + timeout: the timeout for extraction, per document, in seconds + deduplicate_captions: whether to remove duplicate captions + display_links: whether to display link targets + display_anchors: whether to display anchor texts + **kwargs: any other option will be passed to inscriptis + """ + + name = "⛏ Inscriptis" + _requires_dependencies = ["inscriptis"] + + def __init__( + self, + timeout: float = 0.1, + max_new_lines: int = 2, + deduplicate_captions: bool = True, + display_links: bool = False, + display_anchors: bool = True, + **kwargs, + ): + super().__init__(timeout) + self.new_line_chars = "\n" * max_new_lines + self.deduplicate_captions = deduplicate_captions + self.display_links = display_links + self.display_anchors = display_anchors + self.kwargs = kwargs + self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)") + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from inscriptis import get_text + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + text = get_text( + html_content=text, + config=ParserConfig( + css=CSS_PROFILES["strict"], + deduplicate_captions=self.deduplicate_captions, + display_links=self.display_links, + display_anchors=self.display_anchors, + **self.kwargs, + ), + ) + + # remove excessive empty lines + return self.regex_excessive_lines.sub(self.new_line_chars, text).strip() From e3a728552490348cfb381a5553c73550720a7e86 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:22 -0700 Subject: [PATCH 05/11] feat: add readabilipy --- .../pipeline/extractors/readabilipy.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 
src/datatrove/pipeline/extractors/readabilipy.py diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py new file mode 100644 index 00000000..5b1c7861 --- /dev/null +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -0,0 +1,57 @@ +from .base import BaseExtractor + + +class ReadabiliPy(BaseExtractor): + """ReadabiliPy extractor, it uses https://github.com/alan-turing-institute/ReadabiliPy + + We're using the main entry point of ReadabiliPy: the `simple_json_from_html_string` function. + The extracted content is returned as plain text. + + Args: + timeout: the timeout for extraction, per document, in seconds + use_readability: whether to use Mozilla's Readability.js (requires Node.js) + content_digests: whether to include content digests in the output + node_indexes: whether to include node indexes in the output + **kwargs: any other option will be passed to ReadabiliPy + """ + + name = "⛏ ReadabiliPy" + _requires_dependencies = ["readabilipy"] + + def __init__( + self, + timeout: float = 0.1, + use_readability: bool = False, + content_digests: bool = False, + node_indexes: bool = False, + **kwargs, + ): + super().__init__(timeout) + self.use_readability = use_readability + self.content_digests = content_digests + self.node_indexes = node_indexes + self.kwargs = kwargs + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readabilipy import simple_json_from_html_string + + result = simple_json_from_html_string( + text, + use_readability=self.use_readability, + content_digests=self.content_digests, + node_indexes=self.node_indexes, + **self.kwargs, + ) + + content = result.get("plain_text", "") + + if isinstance(content, list): + content = "\n\n".join(block["text"] for block in content) + + return content From 2a6ef15509f309b0c04d63f4dbf7cacb56c6bea3 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:32 -0700 Subject: [PATCH 06/11] feat: add readability --- .../pipeline/extractors/readability.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/datatrove/pipeline/extractors/readability.py diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py new file mode 100644 index 00000000..7ff29381 --- /dev/null +++ b/src/datatrove/pipeline/extractors/readability.py @@ -0,0 +1,58 @@ +from .base import BaseExtractor + + +class Readability(BaseExtractor): + """Readability extractor, it uses https://github.com/buriy/python-readability + + We're using the main entry point of readability-lxml: the `Document` class. + No specific data structure is exchanged with Readability, only the HTML is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + min_text_length: the minimum length of text to consider + retry_length: number of chars to use when searching for body + url: the URL of the page (optional, used for better parsing) + keep_classes: list of classes to keep in the extracted content + **kwargs: any other option will be passed to readability + """ + + name = "⛏ Readability" + _requires_dependencies = ["readability"] + + def __init__( + self, + timeout: float = 0.1, + min_text_length: int = 25, + retry_length: int = 250, + url: str = None, + **kwargs, + ): + super().__init__(timeout) + self.min_text_length = min_text_length + self.retry_length = retry_length + self.url = url + self.kwargs = kwargs + + def extract(self, text: str, postprocessor: BaseExtractor) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readability import Document + + if not postprocessor: + raise ValueError("A postprocessor (extractor) must be provided") + + doc = Document( + text, + min_text_length=self.min_text_length, + retry_length=self.retry_length, + url=self.url, + **self.kwargs, + ) + + cleaned_html = doc.summary() + + return postprocessor.extract(cleaned_html) From 84f1ed4cf13206dc8c9b8e8acff2b5f79dfeb387 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:49:52 -0700 Subject: [PATCH 07/11] feat: add require_readability to utils --- tests/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index 07e63a22..4d00755a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -87,6 +87,14 @@ def require_inscriptis(test_case): return test_case +def require_readabilipy(test_case): + try: + import readabilipy # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires readabilipy")(test_case) + return test_case + + def require_pyarrow(test_case): try: import pyarrow # noqa: F401 From c085736e8bae20b92f7475bd76292ac51f128b37 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:50:04 -0700 Subject: [PATCH 08/11] feat: add tests --- tests/pipeline/test_extractors.py | 39 +++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index dbf4fb5b..b306e1e8 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,23 @@ import unittest -from datatrove.pipeline.extractors import Justext, ReadabilityInscriptis, Resiliparse, Trafilatura - -from ..utils import require_inscriptis, require_justext, require_readability, require_resiliparse, require_trafilatura +from datatrove.pipeline.extractors import ( + Inscriptis, + Justext, + ReadabiliPy, + Readability, + ReadabilityInscriptis, + Resiliparse, + Trafilatura, +) + +from ..utils import ( + require_inscriptis, + require_justext, + require_readabilipy, + require_readability, + require_resiliparse, + require_trafilatura, +) ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -16,7 +31,7 @@ def test_basic_article_trafilatura(self): @require_readability @require_inscriptis - def test_basic_article_readability(self): + def test_basic_article_readability_inscriptis(self): extractor = ReadabilityInscriptis(min_text_length=10, min_text_score=1) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") @@ -29,3 +44,19 @@ def test_basic_article_justext(self): def test_basic_article_resiliparse(self): extractor = Resiliparse() self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readabilipy + def test_basic_article_readabilipy(self): + extractor = ReadabiliPy() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_inscriptis + def test_basic_article_inscriptis(self): + extractor = Inscriptis() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readability + def test_basic_article_readability(self): + extractor = Readability(min_text_length=10, min_text_score=1) + postprocessor = Trafilatura() + self.assertEqual(extractor.extract(ARTICLE_HTML, postprocessor=postprocessor), "Hello World!") From ea3a9154cf4b40cc48b7ff7d9dcd46b12531bc12 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 17:50:17 -0700 Subject: [PATCH 09/11] feat: changed configs & pyproject --- pyproject.toml | 3 +++ src/datatrove/pipeline/extractors/__init__.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index cf226903..b29025ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,9 @@ processing = [ # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", "tldextract", "trafilatura>=1.8.0,<1.12.0", + "justext", + "resiliparse", + "readabilipy", "tokenizers", "ftfy", "fasteners", diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 72714e50..c4ef7a89 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,4 +1,7 @@ +from .inscriptis import Inscriptis from .justext import Justext from .modular import ReadabilityInscriptis +from .readabilipy import ReadabiliPy +from .readability import Readability from .resiliparse import Resiliparse from .trafilatura import Trafilatura From 891850e38d705fcc58ea25dab4895376d837c0d9 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Thu, 26 Sep 2024 19:55:38 -0700 Subject: [PATCH 10/11] fix: move postprocessor to init --- src/datatrove/pipeline/extractors/readability.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py index 7ff29381..f993220f 100644 --- a/src/datatrove/pipeline/extractors/readability.py +++ b/src/datatrove/pipeline/extractors/readability.py @@ -4,8 +4,10 @@ class Readability(BaseExtractor): """Readability extractor, it uses https://github.com/buriy/python-readability - We're using the main entry point of readability-lxml: the `Document` class. - No specific data structure is exchanged with Readability, only the HTML is passed and the extracted text is returned. + We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a + cleaned HTML string. 
+ + The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text Args: timeout: the timeout for extraction, per document, in seconds @@ -21,6 +23,7 @@ class Readability(BaseExtractor): def __init__( self, + postprocessor: BaseExtractor, timeout: float = 0.1, min_text_length: int = 25, retry_length: int = 250, @@ -28,12 +31,13 @@ def __init__( **kwargs, ): super().__init__(timeout) + self.postprocessor = postprocessor self.min_text_length = min_text_length self.retry_length = retry_length self.url = url self.kwargs = kwargs - def extract(self, text: str, postprocessor: BaseExtractor) -> str: + def extract(self, text: str) -> str: """ Args: text: str: html content @@ -42,9 +46,6 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str: """ from readability import Document - if not postprocessor: - raise ValueError("A postprocessor (extractor) must be provided") - doc = Document( text, min_text_length=self.min_text_length, @@ -55,4 +56,4 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str: cleaned_html = doc.summary() - return postprocessor.extract(cleaned_html) + return self.postprocessor.extract(cleaned_html) From 76816c8a0bb6c153bb906f5f0af3bf22b2956566 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Wed, 11 Dec 2024 04:15:04 +0000 Subject: [PATCH 11/11] feat: implement clean_html method in extractors and update Inscriptis initialization - Added a default `clean_html` method to the `BaseExtractor` class, providing a warning for extractors that do not implement their own. - Implemented specific `clean_html` methods in `Inscriptis`, `Justext`, `ReadabiliPy`, `Readability`, and `Trafilatura` extractors to handle HTML cleaning. - Updated the `Inscriptis` extractor to accept a preprocessor during initialization. - Modified the `extract` methods in `ReadabiliPy` and `Readability` to utilize the new `clean_html` method. - Adjusted the `Justext` extractor to remove the default English language parameter from `get_stoplist`. - Updated tests to reflect changes in extractor initialization and functionality. 
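A minimal sketch of how the reworked pieces compose (class and parameter names are taken from this patch; the sample HTML string is illustrative only):

```python
from datatrove.pipeline.extractors import Inscriptis, Readability, Trafilatura

html = "<html><body><article><p>Hello World!</p></article></body></html>"

# Readability cleans the raw HTML via clean_html(), then hands the cleaned
# markup to the postprocessor supplied at construction time for extraction.
readability = Readability(postprocessor=Trafilatura(), min_text_length=10)
print(readability.extract(html))  # "Hello World!"

# Inscriptis now runs its preprocessor's clean_html() before get_text().
inscriptis = Inscriptis(preprocessor=Trafilatura())
print(inscriptis.extract(html))  # "Hello World!"
```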
--- pyproject.toml | 3 +- src/datatrove/pipeline/extractors/base.py | 15 ++++++++++ .../pipeline/extractors/inscriptis.py | 6 +++- src/datatrove/pipeline/extractors/justext.py | 20 +++++++++++-- .../pipeline/extractors/readabilipy.py | 27 +++++++++++------ .../pipeline/extractors/readability.py | 20 ++++++++----- .../pipeline/extractors/trafilatura.py | 15 ++++++++++ src/datatrove/pipeline/tokens/tokenizer.py | 29 ++++++++++++++++++- tests/pipeline/test_extractors.py | 7 +++-- 9 files changed, 117 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b29025ac..270b82cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,9 +50,10 @@ s3 = [ ] processing = [ "fasttext-wheel", + "brotlipy", "nltk", "inscriptis", -# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", + "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", "tldextract", "trafilatura>=1.8.0,<1.12.0", "justext", diff --git a/src/datatrove/pipeline/extractors/base.py b/src/datatrove/pipeline/extractors/base.py index d3622d79..d83eba21 100644 --- a/src/datatrove/pipeline/extractors/base.py +++ b/src/datatrove/pipeline/extractors/base.py @@ -34,6 +34,21 @@ def extract(self, text: str) -> str: """ pass + def clean_html(self, html: str) -> str: + """Default implementation of `clean_html` for extractors that don't return a cleaned version of the HTML + + Since not all extractors produce a cleaned version of the HTML as a part of the extraction process, + this default implementation throws a warning and simply returns the original HTML string. + + Args: + html: str: the HTML content to clean + + Returns: + str: the cleaned HTML + """ + logger.warning(f"{self.name} doesn't have a clean_html() method by default. Skipping...") + return html + def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> DocumentsPipeline: """Iterates through each document in data and calls `timeout_extract` on it. 
diff --git a/src/datatrove/pipeline/extractors/inscriptis.py b/src/datatrove/pipeline/extractors/inscriptis.py index 721b308c..0e3e2c6f 100644 --- a/src/datatrove/pipeline/extractors/inscriptis.py +++ b/src/datatrove/pipeline/extractors/inscriptis.py @@ -22,6 +22,7 @@ class Inscriptis(BaseExtractor): def __init__( self, + preprocessor: BaseExtractor, timeout: float = 0.1, max_new_lines: int = 2, deduplicate_captions: bool = True, @@ -30,6 +31,7 @@ def __init__( **kwargs, ): super().__init__(timeout) + self.preprocessor = preprocessor self.new_line_chars = "\n" * max_new_lines self.deduplicate_captions = deduplicate_captions self.display_links = display_links @@ -48,8 +50,10 @@ def extract(self, text: str) -> str: from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig + cleaned_html = self.preprocessor.clean_html(text) + text = get_text( - html_content=text, + html_content=cleaned_html, config=ParserConfig( css=CSS_PROFILES["strict"], deduplicate_captions=self.deduplicate_captions, diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py index 616e9e60..05e71914 100644 --- a/src/datatrove/pipeline/extractors/justext.py +++ b/src/datatrove/pipeline/extractors/justext.py @@ -39,7 +39,7 @@ def __init__( ): super().__init__(timeout) if stoplist is None: - stoplist = self.get_stoplist(lang="english") + stoplist = self.get_stoplist() self.stoplist = frozenset(stoplist) self.length_low = length_low self.length_high = length_high @@ -52,10 +52,26 @@ def __init__( self.kwargs = kwargs @staticmethod - def get_stoplist(lang: str = "english") -> list[str]: + def get_stoplist(lang: str = "English") -> list[str]: from justext import get_stoplist return get_stoplist(lang) + + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from justext.core import html_to_dom, preprocessor + from lxml.html import tostring + + dom = html_to_dom(html) + dom = preprocessor(dom) + cleaned_html = tostring(dom).decode() + return cleaned_html def extract(self, text: str) -> str: """ diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py index 5b1c7861..c5e345b7 100644 --- a/src/datatrove/pipeline/extractors/readabilipy.py +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -32,6 +32,20 @@ def __init__( self.node_indexes = node_indexes self.kwargs = kwargs + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from readabilipy import simple_tree_from_html_string + + result = simple_tree_from_html_string(html) + return str(result) + + def extract(self, text: str) -> str: """ Args: @@ -39,17 +53,12 @@ def extract(self, text: str) -> str: Returns: plaintext extracted text """ - from readabilipy import simple_json_from_html_string + from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text - result = simple_json_from_html_string( - text, - use_readability=self.use_readability, - content_digests=self.content_digests, - node_indexes=self.node_indexes, - **self.kwargs, - ) + cleaned_html = self.clean_html(text) - content = result.get("plain_text", "") + pl_content = plain_content(cleaned_html, self.content_digests, self.node_indexes) + content = extract_text_blocks_as_plain_text(pl_content) if isinstance(content, list): content = "\n\n".join(block["text"] for block in content) diff --git 
a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py index f993220f..d528a586 100644 --- a/src/datatrove/pipeline/extractors/readability.py +++ b/src/datatrove/pipeline/extractors/readability.py @@ -37,13 +37,7 @@ def __init__( self.url = url self.kwargs = kwargs - def extract(self, text: str) -> str: - """ - Args: - text: str: html content - - Returns: plaintext extracted text - """ + def clean_html(self, text: str) -> str: from readability import Document doc = Document( @@ -54,6 +48,16 @@ def extract(self, text: str) -> str: **self.kwargs, ) - cleaned_html = doc.summary() + return doc.summary() + + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + cleaned_html = self.clean_html(text) return self.postprocessor.extract(cleaned_html) diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py index dbebd62c..d02cecd1 100644 --- a/src/datatrove/pipeline/extractors/trafilatura.py +++ b/src/datatrove/pipeline/extractors/trafilatura.py @@ -34,6 +34,21 @@ def __init__( self.kwargs = kwargs if self.include_images: raise NotImplementedError + + def clean_html(self, html: str) -> str: + """ + + Args: + html: str: html content + + Returns: cleaned HTML + """ + from trafilatura import bare_extraction + from xml.etree import ElementTree + + html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body'] + cleaned_html = ElementTree.tostring(html_body, encoding = "unicode") + return cleaned_html def extract(self, text: str) -> str: """ diff --git a/src/datatrove/pipeline/tokens/tokenizer.py b/src/datatrove/pipeline/tokens/tokenizer.py index d6040a04..3ae80080 100644 --- a/src/datatrove/pipeline/tokens/tokenizer.py +++ b/src/datatrove/pipeline/tokens/tokenizer.py @@ -283,6 +283,7 @@ def __init__( save_filename: str = None, # if defined, the final output filename will be this tokenizer_name_or_path: str = "gpt2", # tokenizer to use, from HF or a local eos_token: str = "<|endoftext|>", # whether to add the EOS token after each document + r2l_digits: bool = False, # whether to tokenize digits from right to left save_loss_metadata: bool = False, # save the loss information shuffle: bool = True, # whether to shuffle documents in the dataset, batch_size: int = 10000, # batch size for tokenization @@ -305,6 +306,7 @@ def __init__( self.save_filename = save_filename self.tokenizer_name_or_path = tokenizer_name_or_path self.eos_token = eos_token + self.r2l_digits = r2l_digits self.save_loss_metadata = save_loss_metadata self.shuffle = shuffle self.batch_size = batch_size @@ -343,8 +345,29 @@ def write_unshuffled(self, data: DocumentsPipeline, filename: str): data (DocumentsPipeline): the documents to process filename (str): the filename to use for the output file """ + import re + from tokenizers import Encoding + R2L_COMMA_TOKEN = "" + + self.tokenizer.add_special_tokens([R2L_COMMA_TOKEN]) + R2L_COMMA_TOKEN_ID = self.tokenizer.token_to_id(R2L_COMMA_TOKEN) + + def add_commas(match): + import sys + + sys.set_int_max_str_digits(1000000) + + num, decimal = match.groups() + decimal = decimal if decimal else "" + return f"{int(num):,}{decimal}".replace(",", R2L_COMMA_TOKEN) + + def preprocess_text(text: str): + number_regex = re.compile(r"([\d]+)(\.\d+)?") + processed_text = number_regex.sub(add_commas, text) + return processed_text + unshuff = TokenizedFile( self.output_folder if not self.shuffle or not 
self.local_working_dir else self.local_working_dir, filename, @@ -358,9 +381,13 @@ def write_unshuffled(self, data: DocumentsPipeline, filename: str): # tokenize document's text in batches to go faster – we compute loss values independently if needed for batch in batched(data, self.batch_size): with self.track_time(unit="batch"): - encoded_batch: list[Encoding] = self.tokenizer.encode_batch([document.text for document in batch]) + encoded_batch: list[Encoding] = self.tokenizer.encode_batch( + [preprocess_text(document.text) if self.r2l_digits else document.text for document in batch] + ) for document, encoded in zip(batch, encoded_batch): tokens = encoded.ids + if self.r2l_digits: + tokens = [tok for tok in tokens if tok != R2L_COMMA_TOKEN_ID] loss_values = self.get_loss_values(document, encoded) if loss_values is not None and len(loss_values) < len(tokens): # crop final section without loss diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index b306e1e8..de9e76dd 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -52,11 +52,12 @@ def test_basic_article_readabilipy(self): @require_inscriptis def test_basic_article_inscriptis(self): - extractor = Inscriptis() + preprocessor = Trafilatura() + extractor = Inscriptis(preprocessor=preprocessor) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") @require_readability def test_basic_article_readability(self): - extractor = Readability(min_text_length=10, min_text_score=1) postprocessor = Trafilatura() - self.assertEqual(extractor.extract(ARTICLE_HTML, postprocessor=postprocessor), "Hello World!") + extractor = Readability(postprocessor=postprocessor, min_text_length=10, min_text_score=1) + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!")
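As a closing usage sketch, one of the new extractors wired into a datatrove pipeline. The reader, writer, paths, and task count below are illustrative assumptions and not part of this patch series:

```python
from datatrove.executor import LocalPipelineExecutor
from datatrove.pipeline.extractors import Justext
from datatrove.pipeline.readers import WarcReader
from datatrove.pipeline.writers import JsonlWriter

executor = LocalPipelineExecutor(
    pipeline=[
        WarcReader("data/warcs/"),         # assumed input location
        Justext(remove_boilerplate=True),  # strip boilerplate paragraphs
        JsonlWriter("output/extracted/"),  # assumed output location
    ],
    tasks=4,
)
executor.run()
```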