From 6f983a22ca4bfb58c066e05ac741afc6a3207d8b Mon Sep 17 00:00:00 2001
From: Anton Pelykh <pelyh11@gmail.com>
Date: Mon, 9 Dec 2024 15:51:14 +0100
Subject: [PATCH] fix: add missing stream mime type assignment to the
 `LinkContentFetcher` (#8596)

* add missing stream mime type assignment to the `LinkContentFetcher`

* fix release note fmt

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
---
 haystack/components/fetchers/link_content.py        |  1 +
 ...o-the-link-content-fetcher-7581e2728e2130b8.yaml |  8 ++++++++
 .../fetchers/test_link_content_fetcher.py           | 13 +++++++++++++
 3 files changed, 22 insertions(+)
 create mode 100644 releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml
diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py
index 38712d683f..fb85d11485 100644
--- a/haystack/components/fetchers/link_content.py
+++ b/haystack/components/fetchers/link_content.py
@@ -151,6 +151,7 @@ def run(self, urls: List[str]):
         if len(urls) == 1:
             stream_metadata, stream = self._fetch(urls[0])
             stream.meta.update(stream_metadata)
+            stream.mime_type = stream.meta.get("content_type", None)
             streams.append(stream)
         else:
             with ThreadPoolExecutor() as executor:
diff --git a/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml b/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml
new file mode 100644
index 0000000000..6067fa3140
--- /dev/null
+++ b/releasenotes/notes/add-missing-mime-type-assignment-to-the-link-content-fetcher-7581e2728e2130b8.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Add missing stream mime type assignment to the `LinkContentFetcher` for
+    the single-URL scenario.
+
+    Previously, pipelines that use `FileTypeRouter` could fail when they
+    received a single URL as input.
diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py
index 35cbd5e40c..6c2b5e4bc3 100644
--- a/test/components/fetchers/test_link_content_fetcher.py
+++ b/test/components/fetchers/test_link_content_fetcher.py
@@ -74,6 +74,7 @@ def test_run_text(self):
             first_stream = streams[0]
             assert first_stream.data == correct_response
             assert first_stream.meta["content_type"] == "text/plain"
+            assert first_stream.mime_type == "text/plain"
 
     def test_run_html(self):
         correct_response = b"<h1>Example test response</h1>"
@@ -86,6 +87,7 @@ def test_run_html(self):
             first_stream = streams[0]
             assert first_stream.data == correct_response
             assert first_stream.meta["content_type"] == "text/html"
+            assert first_stream.mime_type == "text/html"
 
     def test_run_binary(self, test_files_path):
         file_bytes = open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read()
@@ -98,6 +100,7 @@ def test_run_binary(self, test_files_path):
             first_stream = streams[0]
             assert first_stream.data == file_bytes
             assert first_stream.meta["content_type"] == "application/pdf"
+            assert first_stream.mime_type == "application/pdf"
 
     def test_run_bad_status_code(self):
         empty_byte_stream = b""
@@ -112,6 +115,7 @@ def test_run_bad_status_code(self):
         first_stream = streams[0]
         assert first_stream.data == empty_byte_stream
         assert first_stream.meta["content_type"] == "text/html"
+        assert first_stream.mime_type == "text/html"
 
     @pytest.mark.integration
     def test_link_content_fetcher_html(self):
@@ -121,6 +125,7 @@ def test_link_content_fetcher_html(self):
         assert "Haystack" in first_stream.data.decode("utf-8")
         assert first_stream.meta["content_type"] == "text/html"
         assert "url" in first_stream.meta and first_stream.meta["url"] == HTML_URL
+        assert first_stream.mime_type == "text/html"
 
     @pytest.mark.integration
     def test_link_content_fetcher_text(self):
@@ -130,6 +135,7 @@ def test_link_content_fetcher_text(self):
         assert "Haystack" in first_stream.data.decode("utf-8")
         assert first_stream.meta["content_type"] == "text/plain"
         assert "url" in first_stream.meta and first_stream.meta["url"] == TEXT_URL
+        assert first_stream.mime_type == "text/plain"
 
     @pytest.mark.integration
     def test_link_content_fetcher_pdf(self):
@@ -139,6 +145,7 @@ def test_link_content_fetcher_pdf(self):
         first_stream = streams[0]
         assert first_stream.meta["content_type"] in ("application/octet-stream", "application/pdf")
         assert "url" in first_stream.meta and first_stream.meta["url"] == PDF_URL
+        assert first_stream.mime_type in ("application/octet-stream", "application/pdf")
 
     @pytest.mark.integration
     def test_link_content_fetcher_multiple_different_content_types(self):
@@ -152,8 +159,10 @@ def test_link_content_fetcher_multiple_different_content_types(self):
             assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
             if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8")
+                assert stream.mime_type == "text/html"
             elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0
+                assert stream.mime_type == "application/pdf"
 
     @pytest.mark.integration
     def test_link_content_fetcher_multiple_html_streams(self):
@@ -169,8 +178,10 @@ def test_link_content_fetcher_multiple_html_streams(self):
             assert stream.meta["content_type"] in ("text/html", "application/pdf", "application/octet-stream")
             if stream.meta["content_type"] == "text/html":
                 assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
+                assert stream.mime_type == "text/html"
             elif stream.meta["content_type"] == "application/pdf":
                 assert len(stream.data) > 0
+                assert stream.mime_type == "application/pdf"
 
     @pytest.mark.integration
     def test_mix_of_good_and_failed_requests(self):
@@ -184,6 +195,7 @@ def test_mix_of_good_and_failed_requests(self):
         assert len(result["streams"]) == 1
         first_stream = result["streams"][0]
         assert first_stream.meta["content_type"] == "text/html"
+        assert first_stream.mime_type == "text/html"
 
     @pytest.mark.integration
     def test_bad_request_exception_raised(self):
@@ -201,4 +213,5 @@ def test_link_content_fetcher_audio(self):
         streams = fetcher.run(["https://download.samplelib.com/mp3/sample-3s.mp3"])["streams"]
         first_stream = streams[0]
         assert first_stream.meta["content_type"] == "audio/mpeg"
+        assert first_stream.mime_type == "audio/mpeg"
         assert len(first_stream.data) > 0