Skip to content

Commit

Permalink
fix xhtml+xml: accept "application/xhtml+xml" payloads in the WARC reader's content-type filter
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Dec 11, 2024
1 parent 842b241 commit b701935
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/datatrove/pipeline/readers/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,21 @@ def process_record(record: "ArcWarcRecord") -> dict | None:
# content type filtering
mime_type = record.rec_headers.get("WARC-Identified-Payload-Type", None)
if mime_type is not None and (
- mime_type != "text/html" and (record.rec_type != "conversion" or mime_type != "text/plain")
+ mime_type != "text/html"
+ and mime_type != "application/xhtml+xml"
+ and (record.rec_type != "conversion" or mime_type != "text/plain")
):
return

content_bytes = record.content_stream().read()
if mime_type is None:
# fallback for older crawls without payload types
mime_type = magic.from_buffer(content_bytes, mime=True)
- if mime_type != "text/html" and (record.rec_type != "conversion" or mime_type != "text/plain"):
+ if (
+ mime_type != "text/html"
+ and mime_type != "application/xhtml+xml"
+ and (record.rec_type != "conversion" or mime_type != "text/plain")
+ ):
return

# Decode the response bytes
Expand Down

0 comments on commit b701935

Please sign in to comment.