diff --git a/src/datatrove/pipeline/extractors/base.py b/src/datatrove/pipeline/extractors/base.py index d83eba21..b7fc7dc8 100644 --- a/src/datatrove/pipeline/extractors/base.py +++ b/src/datatrove/pipeline/extractors/base.py @@ -46,7 +46,6 @@ def clean_html(self, html: str) -> str: Returns: str: the cleaned HTML """ - logger.warning(f"{self.name} doesn't have a clean_html() method by default. Skipping...") return html def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> DocumentsPipeline: @@ -68,10 +67,12 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do try: doc.text = future.result(timeout=self.timeout) except TimeoutError: - logger.warning("⏰ Timeout while cleaning record text. Skipping record.") + logger.warning( + "⏰ Timeout while cleaning record text. Skipping record.") continue except Exception as e: - logger.warning(f'❌ Error "{e}" while cleaning record text. Skipping record.') + logger.warning( + f'❌ Error "{e}" while cleaning record text. Skipping record.') continue if doc.text: self.stat_update(StatHints.forwarded) diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py index 74c2a997..574d44de 100644 --- a/src/datatrove/pipeline/extractors/resiliparse.py +++ b/src/datatrove/pipeline/extractors/resiliparse.py @@ -1,4 +1,5 @@ from .base import BaseExtractor +from datatrove.utils.logging import logger class Resiliparse(BaseExtractor): @@ -48,6 +49,9 @@ def __init__( self.noscript = noscript self.comments = comments self.skip_elements = skip_elements + logger.warning( + f"{self.name} doesn't have a clean_html() method by default, so the original HTML will be used as is." + ) def extract(self, text: str) -> str: """