diff --git a/src/datatrove/pipeline/extractors/base.py b/src/datatrove/pipeline/extractors/base.py index b7fc7dc8..2b58a6ba 100644 --- a/src/datatrove/pipeline/extractors/base.py +++ b/src/datatrove/pipeline/extractors/base.py @@ -67,12 +67,10 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do try: doc.text = future.result(timeout=self.timeout) except TimeoutError: - logger.warning( - "⏰ Timeout while cleaning record text. Skipping record.") + logger.warning("⏰ Timeout while cleaning record text. Skipping record.") continue except Exception as e: - logger.warning( - f'❌ Error "{e}" while cleaning record text. Skipping record.') + logger.warning(f'❌ Error "{e}" while cleaning record text. Skipping record.') continue if doc.text: self.stat_update(StatHints.forwarded) diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py index 8a0f32df..643c135c 100644 --- a/src/datatrove/pipeline/extractors/readabilipy.py +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -45,7 +45,6 @@ def clean_html(self, html: str) -> str: result = simple_tree_from_html_string(html) return str(result) - def extract(self, text: str) -> str: """ Args: diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py index d528a586..10556e11 100644 --- a/src/datatrove/pipeline/extractors/readability.py +++ b/src/datatrove/pipeline/extractors/readability.py @@ -50,7 +50,6 @@ def clean_html(self, text: str) -> str: return doc.summary() - def extract(self, text: str) -> str: """ Args: diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py index 12cf8efd..9f87428f 100644 --- a/src/datatrove/pipeline/extractors/trafilatura.py +++ b/src/datatrove/pipeline/extractors/trafilatura.py @@ -47,8 +47,8 @@ def clean_html(self, html: str) -> str: from trafilatura import bare_extraction - html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body'] - cleaned_html = ElementTree.tostring(html_body, encoding = "unicode") + html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)["body"] + cleaned_html = ElementTree.tostring(html_body, encoding="unicode") return cleaned_html def extract(self, text: str) -> str: