Skip to content

Commit

Permalink
style: fixed ruff format errors
Browse files: browse the repository at this point in the history
  • Loading branch information
garrethlee committed Dec 21, 2024
1 parent c9f1c2b commit 2ac81d5
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 8 deletions.
6 changes: 2 additions & 4 deletions src/datatrove/pipeline/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,10 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do
try:
doc.text = future.result(timeout=self.timeout)
except TimeoutError:
- logger.warning(
- "⏰ Timeout while cleaning record text. Skipping record.")
+ logger.warning("⏰ Timeout while cleaning record text. Skipping record.")
continue
except Exception as e:
- logger.warning(
- f'❌ Error "{e}" while cleaning record text. Skipping record.')
+ logger.warning(f'❌ Error "{e}" while cleaning record text. Skipping record.')
continue
if doc.text:
self.stat_update(StatHints.forwarded)
Expand Down
1 change: 0 additions & 1 deletion src/datatrove/pipeline/extractors/readabilipy.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def clean_html(self, html: str) -> str:
result = simple_tree_from_html_string(html)
return str(result)


def extract(self, text: str) -> str:
"""
Args:
Expand Down
1 change: 0 additions & 1 deletion src/datatrove/pipeline/extractors/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def clean_html(self, text: str) -> str:

return doc.summary()


def extract(self, text: str) -> str:
"""
Args:
Expand Down
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/extractors/trafilatura.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def clean_html(self, html: str) -> str:

from trafilatura import bare_extraction

- html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body']
- cleaned_html = ElementTree.tostring(html_body, encoding = "unicode")
+ html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)["body"]
+ cleaned_html = ElementTree.tostring(html_body, encoding="unicode")
return cleaned_html

def extract(self, text: str) -> str:
Expand Down

0 comments on commit 2ac81d5

Please sign in to comment.