Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add missing stream mime type assignment to the LinkContentFetcher #8596

Merged
merged 2 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions haystack/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def run(self, urls: List[str]):
if len(urls) == 1:
stream_metadata, stream = self._fetch(urls[0])
stream.meta.update(stream_metadata)
stream.mime_type = stream.meta.get("content_type", None)
streams.append(stream)
else:
with ThreadPoolExecutor() as executor:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
fixes:
- |
Add the missing stream MIME type assignment to the `LinkContentFetcher` for
the single-URL scenario.

Previously, pipelines that use `FileTypeRouter` could fail if they received
a single URL as input.
13 changes: 13 additions & 0 deletions test/components/fetchers/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def test_run_text(self):
first_stream = streams[0]
assert first_stream.data == correct_response
assert first_stream.meta["content_type"] == "text/plain"
assert first_stream.mime_type == "text/plain"

def test_run_html(self):
correct_response = b"<h1>Example test response</h1>"
Expand All @@ -86,6 +87,7 @@ def test_run_html(self):
first_stream = streams[0]
assert first_stream.data == correct_response
assert first_stream.meta["content_type"] == "text/html"
assert first_stream.mime_type == "text/html"

def test_run_binary(self, test_files_path):
file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read()
Expand All @@ -98,6 +100,7 @@ def test_run_binary(self, test_files_path):
first_stream = streams[0]
assert first_stream.data == file_bytes
assert first_stream.meta["content_type"] == "application/pdf"
assert first_stream.mime_type == "application/pdf"

def test_run_bad_status_code(self):
empty_byte_stream = b""
Expand All @@ -112,6 +115,7 @@ def test_run_bad_status_code(self):
first_stream = streams[0]
assert first_stream.data == empty_byte_stream
assert first_stream.meta["content_type"] == "text/html"
assert first_stream.mime_type == "text/html"

@pytest.mark.integration
def test_link_content_fetcher_html(self):
Expand All @@ -121,6 +125,7 @@ def test_link_content_fetcher_html(self):
assert "Haystack" in first_stream.data.decode("utf-8")
assert first_stream.meta["content_type"] == "text/html"
assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL
assert first_stream.mime_type == "text/html"

@pytest.mark.integration
def test_link_content_fetcher_text(self):
Expand All @@ -130,6 +135,7 @@ def test_link_content_fetcher_text(self):
assert "Haystack" in first_stream.data.decode("utf-8")
assert first_stream.meta["content_type"] == "text/plain"
assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL
assert first_stream.mime_type == "text/plain"

@pytest.mark.integration
def test_link_content_fetcher_pdf(self):
Expand All @@ -139,6 +145,7 @@ def test_link_content_fetcher_pdf(self):
first_stream = streams[0]
assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL
assert first_stream.mime_type in ("application/octet-stream", "application/pdf")

@pytest.mark.integration
def test_link_content_fetcher_multiple_different_content_types(self):
Expand All @@ -152,8 +159,10 @@ def test_link_content_fetcher_multiple_different_content_types(self):
assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
if stream.meta["content_type"] == "text/html":
assert "Haystack" in stream.data.decode("utf-8")
assert stream.mime_type == "text/html"
elif stream.meta["content_type"] == "application/pdf":
assert len(stream.data) > 0
assert stream.mime_type == "application/pdf"

@pytest.mark.integration
def test_link_content_fetcher_multiple_html_streams(self):
Expand All @@ -169,8 +178,10 @@ def test_link_content_fetcher_multiple_html_streams(self):
assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
if stream.meta["content_type"] == "text/html":
assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
assert stream.mime_type == "text/html"
elif stream.meta["content_type"] == "application/pdf":
assert len(stream.data) > 0
assert stream.mime_type == "application/pdf"

@pytest.mark.integration
def test_mix_of_good_and_failed_requests(self):
Expand All @@ -184,6 +195,7 @@ def test_mix_of_good_and_failed_requests(self):
assert len(result["streams"]) == 1
first_stream = result["streams"][0]
assert first_stream.meta["content_type"] == "text/html"
assert first_stream.mime_type == "text/html"

@pytest.mark.integration
def test_bad_request_exception_raised(self):
Expand All @@ -201,4 +213,5 @@ def test_link_content_fetcher_audio(self):
streams = fetcher.run(["https://download.samplelib.com/mp3/sample-3s.mp3"])["streams"]
first_stream = streams[0]
assert first_stream.meta["content_type"] == "audio/mpeg"
assert first_stream.mime_type == "audio/mpeg"
assert len(first_stream.data) > 0
Loading