Skip to content

Commit

Permalink
refactor: move warning log to constructor to avoid ballooning log file sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
garrethlee committed Dec 18, 2024
1 parent d0f6ead commit 26bf413
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
7 changes: 4 additions & 3 deletions src/datatrove/pipeline/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def clean_html(self, html: str) -> str:
Returns:
str: the cleaned HTML
"""
- logger.warning(f"{self.name} doesn't have a clean_html() method by default. Skipping...")
return html

def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
Expand All @@ -68,10 +67,12 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do
try:
doc.text = future.result(timeout=self.timeout)
except TimeoutError:
- logger.warning("⏰ Timeout while cleaning record text. Skipping record.")
+ logger.warning(
+ "⏰ Timeout while cleaning record text. Skipping record.")
continue
except Exception as e:
- logger.warning(f'❌ Error "{e}" while cleaning record text. Skipping record.')
+ logger.warning(
+ f'❌ Error "{e}" while cleaning record text. Skipping record.')
continue
if doc.text:
self.stat_update(StatHints.forwarded)
Expand Down
4 changes: 4 additions & 0 deletions src/datatrove/pipeline/extractors/resiliparse.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .base import BaseExtractor
+ from datatrove.utils.logging import logger


class Resiliparse(BaseExtractor):
Expand Down Expand Up @@ -48,6 +49,9 @@ def __init__(
self.noscript = noscript
self.comments = comments
self.skip_elements = skip_elements
+ logger.warning(
+ f"{self.name} doesn't have a clean_html() method by default, so the original HTML will be used as is."
+ )

def extract(self, text: str) -> str:
"""
Expand Down

0 comments on commit 26bf413

Please sign in to comment.