From 6f983a22ca4bfb58c066e05ac741afc6a3207d8b Mon Sep 17 00:00:00 2001 From: Anton Pelykh Date: Mon, 9 Dec 2024 15:51:14 +0100 Subject: [PATCH] fix: add missing stream mime type assignment to the `LinkContentFetcher` (#8596) * add missing stream mime type assignment to the `LinkContentFetcher` * fix release note fmt --------- Co-authored-by: Stefano Fiorucci --- haystack/components/fetchers/link_content.py | 1 + ...o-the-link-content-fetcher-7581e2728e2130b8.yaml | 8 ++++++++ .../fetchers/test_link_content_fetcher.py | 13 +++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index 38712d683f..fb85d11485 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -151,6 +151,7 @@ def run(self, urls: List[str]): if len(urls) == 1: stream_metadata, stream = self._fetch(urls[0]) stream.meta.update(stream_metadata) + stream.mime_type = stream.meta.get("content_type", None) streams.append(stream) else: with ThreadPoolExecutor() as executor: diff --git a/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml b/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml new file mode 100644 index 0000000000..6067fa3140 --- /dev/null +++ b/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Add missing stream mime type assignment to the `LinkContentFetcher` for + the single URL scenario. + + Previously, pipelines that use `FileTypeRouter` could fail if they received + a single URL as input. 
diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py index 35cbd5e40c..6c2b5e4bc3 100644 --- a/test/components/fetchers/test_link_content_fetcher.py +++ b/test/components/fetchers/test_link_content_fetcher.py @@ -74,6 +74,7 @@ def test_run_text(self): first_stream = streams[0] assert first_stream.data == correct_response assert first_stream.meta["content_type"] == "text/plain" + assert first_stream.mime_type == "text/plain" def test_run_html(self): correct_response = b"

Example test response

" @@ -86,6 +87,7 @@ def test_run_html(self): first_stream = streams[0] assert first_stream.data == correct_response assert first_stream.meta["content_type"] == "text/html" + assert first_stream.mime_type == "text/html" def test_run_binary(self, test_files_path): file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read() @@ -98,6 +100,7 @@ def test_run_binary(self, test_files_path): first_stream = streams[0] assert first_stream.data == file_bytes assert first_stream.meta["content_type"] == "application/pdf" + assert first_stream.mime_type == "application/pdf" def test_run_bad_status_code(self): empty_byte_stream = b"" @@ -112,6 +115,7 @@ def test_run_bad_status_code(self): first_stream = streams[0] assert first_stream.data == empty_byte_stream assert first_stream.meta["content_type"] == "text/html" + assert first_stream.mime_type == "text/html" @pytest.mark.integration def test_link_content_fetcher_html(self): @@ -121,6 +125,7 @@ def test_link_content_fetcher_html(self): assert "Haystack" in first_stream.data.decode("utf-8") assert first_stream.meta["content_type"] == "text/html" assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL + assert first_stream.mime_type == "text/html" @pytest.mark.integration def test_link_content_fetcher_text(self): @@ -130,6 +135,7 @@ def test_link_content_fetcher_text(self): assert "Haystack" in first_stream.data.decode("utf-8") assert first_stream.meta["content_type"] == "text/plain" assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL + assert first_stream.mime_type == "text/plain" @pytest.mark.integration def test_link_content_fetcher_pdf(self): @@ -139,6 +145,7 @@ def test_link_content_fetcher_pdf(self): first_stream = streams[0] assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf") assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL + assert first_stream.mime_type in ("application/octet-stream", 
"application/pdf") @pytest.mark.integration def test_link_content_fetcher_multiple_different_content_types(self): @@ -152,8 +159,10 @@ def test_link_content_fetcher_multiple_different_content_types(self): assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream") if stream.meta["content_type"] == "text/html": assert "Haystack" in stream.data.decode("utf-8") + assert stream.mime_type == "text/html" elif stream.meta["content_type"] == "application/pdf": assert len(stream.data) > 0 + assert stream.mime_type == "application/pdf" @pytest.mark.integration def test_link_content_fetcher_multiple_html_streams(self): @@ -169,8 +178,10 @@ def test_link_content_fetcher_multiple_html_streams(self): assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream") if stream.meta["content_type"] == "text/html": assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8") + assert stream.mime_type == "text/html" elif stream.meta["content_type"] == "application/pdf": assert len(stream.data) > 0 + assert stream.mime_type == "application/pdf" @pytest.mark.integration def test_mix_of_good_and_failed_requests(self): @@ -184,6 +195,7 @@ def test_mix_of_good_and_failed_requests(self): assert len(result["streams"]) == 1 first_stream = result["streams"][0] assert first_stream.meta["content_type"] == "text/html" + assert first_stream.mime_type == "text/html" @pytest.mark.integration def test_bad_request_exception_raised(self): @@ -201,4 +213,5 @@ def test_link_content_fetcher_audio(self): streams = fetcher.run(["https://download.samplelib.com/mp3/sample-3s.mp3"])["streams"] first_stream = streams[0] assert first_stream.meta["content_type"] == "audio/mpeg" + assert first_stream.mime_type == "audio/mpeg" assert len(first_stream.data) > 0