From 142c1fd1837bdd76a780daf2697bea901659798f Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Sat, 19 Oct 2024 22:23:07 -0400 Subject: [PATCH 01/45] begin rough draft --- .../components/reader/__init__.py | 6 + .../components/reader/reader.py | 124 ++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 integrations/jina/src/haystack_integrations/components/reader/__init__.py create mode 100644 integrations/jina/src/haystack_integrations/components/reader/reader.py diff --git a/integrations/jina/src/haystack_integrations/components/reader/__init__.py b/integrations/jina/src/haystack_integrations/components/reader/__init__.py new file mode 100644 index 000000000..d75e247e1 --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/reader/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .reader import JinaReader + +__all__ = ["JinaReader"] \ No newline at end of file diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/reader.py new file mode 100644 index 000000000..43cf173b5 --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/reader/reader.py @@ -0,0 +1,124 @@ +######## Jina Reader Docs Implementation ######## +######## Read ######## +import requests + +url = 'https://r.jina.ai/https://example.com' +headers = { + 'Authorization': 'Bearer jina_123456' +} + +response = requests.get(url, headers=headers) +print(response.text) + +######### Search ########## +url = 'https://s.jina.ai/When%20was%20Jina%20AI%20founded?' +headers = { + 'Authorization': 'Bearer jina_123456' +} + +response = requests.get(url, headers=headers) + +print(response.text) + +###### Ground ####### +url = 'https://g.jina.ai/Jina%20AI%20was%20founded%20in%202020%20in%20Berlin.' 
+headers = { + 'Accept': 'application/json', + 'Authorization': 'Bearer jina_123456' +} + +response = requests.get(url, headers=headers) + +print(response.text) +##### End Jina Docs ###### + +##### Haystack Implementation ###### +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict, List, Optional, Union + +import requests +import urllib +from enum import Enum +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.utils import Secret, deserialize_secrets_inplace + + +class JinaReaderMode(Enum): + """ + Specifies how inputs to the NVIDIA embedding components are truncated. + If START, the input will be truncated from the start. + If END, the input will be truncated from the end. + If NONE, an error will be returned (if the input is too long). + """ + + READ = "READ" + SEARCH = "SEARCH" + GROUND = "GROUND" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "JinaReaderMode": + """ + Create the reader mode from a string. + + :param string: + String to convert. + :returns: + Reader mode. + """ + enum_map = {e.value: e for e in JinaReaderMode} + reader_mode = enum_map.get(string) + if reader_mode is None: + msg = f"Unknown reader mode '{string}'. 
Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return reader_mode + +@component +class JinaReader(): + + def __init__( + self, + api_key: Secret = Secret.from_env_var("JINA_API_KEY"), + mode: Union[JinaReaderMode, str], + url: Optional[str], + reader_query: Optional[str] + ): + + resolved_api_key = api_key.resolve_value() + self.api_key = api_key + self.mode = mode + self.url = url + self.reader_query = reader_query + + self._session = requests.Session() + self._session.headers.update( + { + "Authorization": f"Bearer {resolved_api_key}", + "Accept-Encoding": "identity", + "Content-type": "application/json", + } + ) + + @component.output_types(document=Document) + def run(self, input:str): + + # check input depending on mode + mode_map = { + JinaReaderMode.READ: "r", + JinaReaderMode.SEARCH: "s", + JinaReaderMode.GROUND: "g" + } + base_url = "https://{}.jina.ai/".format(mode_map[self.mode]) + encoded_target = urllib.parse.quote(input, safe="") + url = f"{base_url}{encoded_target}" + response = self._session.get( + url, + headers=headers + ) + + ... 
# do the rest and clean ups + \ No newline at end of file From 2209946db353da746927a6969efcb79d000f53c9 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Sat, 19 Oct 2024 22:24:12 -0400 Subject: [PATCH 02/45] begin rough draft --- .../jina/src/haystack_integrations/components/reader/reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/reader.py index 43cf173b5..a2f2be57e 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/reader.py @@ -29,6 +29,7 @@ response = requests.get(url, headers=headers) + print(response.text) ##### End Jina Docs ###### From 0220cdc4b111adcf14e73852efddf6b397d8de13 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Sat, 19 Oct 2024 22:26:15 -0400 Subject: [PATCH 03/45] begin rough draft --- .../components/reader/{ => jina}/__init__.py | 0 .../haystack_integrations/components/reader/{ => jina}/reader.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename integrations/jina/src/haystack_integrations/components/reader/{ => jina}/__init__.py (100%) rename integrations/jina/src/haystack_integrations/components/reader/{ => jina}/reader.py (100%) diff --git a/integrations/jina/src/haystack_integrations/components/reader/__init__.py b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/reader/__init__.py rename to integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/reader/reader.py rename to 
integrations/jina/src/haystack_integrations/components/reader/jina/reader.py From 6b0d287592d16c6a7a15a16590997374af9396cb Mon Sep 17 00:00:00 2001 From: anitha6g Date: Sun, 20 Oct 2024 21:59:19 -0400 Subject: [PATCH 04/45] small fixes --- .../components/reader/reader.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/reader.py index 43cf173b5..4e1285e4d 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/reader.py @@ -47,10 +47,12 @@ class JinaReaderMode(Enum): """ - Specifies how inputs to the NVIDIA embedding components are truncated. - If START, the input will be truncated from the start. - If END, the input will be truncated from the end. - If NONE, an error will be returned (if the input is too long). + Enum representing modes for the Jina Reader. + + Modes: + READ: For reading documents. + SEARCH: For searching within documents. + GROUND: For grounding or fact checking. """ READ = "READ" @@ -82,10 +84,10 @@ class JinaReader(): def __init__( self, - api_key: Secret = Secret.from_env_var("JINA_API_KEY"), mode: Union[JinaReaderMode, str], url: Optional[str], - reader_query: Optional[str] + reader_query: Optional[str], + api_key: Secret = Secret.from_env_var("JINA_API_KEY"), ): resolved_api_key = api_key.resolve_value() @@ -116,8 +118,7 @@ def run(self, input:str): encoded_target = urllib.parse.quote(input, safe="") url = f"{base_url}{encoded_target}" response = self._session.get( - url, - headers=headers + url ) ... 
# do the rest and clean ups From e4d61157a3cbbf11fab7dd455c2cff02ff2ff794 Mon Sep 17 00:00:00 2001 From: anitha6g Date: Mon, 21 Oct 2024 08:23:21 -0400 Subject: [PATCH 05/45] Haystack document conversion --- .../components/reader/reader.py | 43 ++++--------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/reader.py index 4e1285e4d..3bb220ed8 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/reader.py @@ -1,36 +1,3 @@ -######## Jina Reader Docs Implementation ######## -######## Read ######## -import requests - -url = 'https://r.jina.ai/https://example.com' -headers = { - 'Authorization': 'Bearer jina_123456' -} - -response = requests.get(url, headers=headers) -print(response.text) - -######### Search ########## -url = 'https://s.jina.ai/When%20was%20Jina%20AI%20founded?' -headers = { - 'Authorization': 'Bearer jina_123456' -} - -response = requests.get(url, headers=headers) - -print(response.text) - -###### Ground ####### -url = 'https://g.jina.ai/Jina%20AI%20was%20founded%20in%202020%20in%20Berlin.' 
-headers = { - 'Accept': 'application/json', - 'Authorization': 'Bearer jina_123456' -} - -response = requests.get(url, headers=headers) - -print(response.text) -##### End Jina Docs ###### ##### Haystack Implementation ###### # SPDX-FileCopyrightText: 2023-present deepset GmbH @@ -85,8 +52,8 @@ class JinaReader(): def __init__( self, mode: Union[JinaReaderMode, str], - url: Optional[str], - reader_query: Optional[str], + url: Optional[str] = None, + reader_query: Optional[str] = None, api_key: Secret = Secret.from_env_var("JINA_API_KEY"), ): @@ -120,6 +87,12 @@ def run(self, input:str): response = self._session.get( url ) + metadata = { + 'content_type': response.headers['Content-Type'], + 'url':input + } + document = [Document(content = response.content, meta = metadata)] + return document ... # do the rest and clean ups \ No newline at end of file From ad626f8ede8be9c71d49b85dcc4f96cf927ab410 Mon Sep 17 00:00:00 2001 From: anitha6g Date: Mon, 21 Oct 2024 15:52:05 -0400 Subject: [PATCH 06/45] git folder changes --- .../components/reader/{ => jina}/__init__.py | 0 .../haystack_integrations/components/reader/{ => jina}/reader.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename integrations/jina/src/haystack_integrations/components/reader/{ => jina}/__init__.py (100%) rename integrations/jina/src/haystack_integrations/components/reader/{ => jina}/reader.py (100%) diff --git a/integrations/jina/src/haystack_integrations/components/reader/__init__.py b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/reader/__init__.py rename to integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py diff --git a/integrations/jina/src/haystack_integrations/components/reader/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py similarity index 100% rename from 
integrations/jina/src/haystack_integrations/components/reader/reader.py rename to integrations/jina/src/haystack_integrations/components/reader/jina/reader.py From f71d7b46379d6d7507cedb09a867e5cc22607ff7 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 18:36:02 -0400 Subject: [PATCH 07/45] add pipeline functions --- .../components/reader/jina/reader.py | 65 ++++++++++--------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 748d4e5d2..54adab824 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -2,11 +2,11 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +import urllib +from enum import Enum from typing import Any, Dict, List, Optional, Union import requests -import urllib -from enum import Enum from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace @@ -46,22 +46,17 @@ def from_str(cls, string: str) -> "JinaReaderMode": raise ValueError(msg) return reader_mode -@component -class JinaReader(): +@component +class JinaReader: def __init__( self, mode: Union[JinaReaderMode, str], - url: Optional[str] = None, - reader_query: Optional[str] = None, api_key: Secret = Secret.from_env_var("JINA_API_KEY"), ): - resolved_api_key = api_key.resolve_value() self.api_key = api_key self.mode = mode - self.url = url - self.reader_query = reader_query self._session = requests.Session() self._session.headers.update( @@ -72,27 +67,39 @@ def __init__( } ) - @component.output_types(document=Document) - def run(self, input:str): + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. 
+ :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + mode=self.mode, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + @component.output_types(document=Document) + def run(self, query: str): # check input depending on mode - mode_map = { - JinaReaderMode.READ: "r", - JinaReaderMode.SEARCH: "s", - JinaReaderMode.GROUND: "g" - } - base_url = "https://{}.jina.ai/".format(mode_map[self.mode]) - encoded_target = urllib.parse.quote(input, safe="") + mode_map = {JinaReaderMode.READ: "r", JinaReaderMode.SEARCH: "s", JinaReaderMode.GROUND: "g"} + mode = mode_map[self.mode] + base_url = f"https://{mode}.jina.ai/" + encoded_target = urllib.parse.quote(query, safe="") url = f"{base_url}{encoded_target}" - response = self._session.get( - url - ) - metadata = { - 'content_type': response.headers['Content-Type'], - 'url':input - } - document = [Document(content = response.content, meta = metadata)] + response = self._session.get(url) + metadata = {"content_type": response.headers["Content-Type"], "url": input} + document = [Document(content=response.content, meta=metadata)] return document - - ... 
# do the rest and clean ups - \ No newline at end of file From 6b198a84129d578662199b12066d008021c4de92 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 18:46:13 -0400 Subject: [PATCH 08/45] correct mode map --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 54adab824..c2a286813 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -94,7 +94,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": @component.output_types(document=Document) def run(self, query: str): # check input depending on mode - mode_map = {JinaReaderMode.READ: "r", JinaReaderMode.SEARCH: "s", JinaReaderMode.GROUND: "g"} + mode_map = {"READ": "r", "SEARCH": "s", "GROUND": "g"} mode = mode_map[self.mode] base_url = f"https://{mode}.jina.ai/" encoded_target = urllib.parse.quote(query, safe="") From aa03e4d505dd2cfc6e443a15ac514e1f0d694bc6 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 20:50:48 -0400 Subject: [PATCH 09/45] add reader mode Enum class file --- .../components/reader/jina/__init__.py | 2 +- .../components/reader/jina/reader.py | 44 ++----------------- .../components/reader/jina/reader_mode.py | 41 +++++++++++++++++ 3 files changed, 46 insertions(+), 41 deletions(-) create mode 100644 integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py index d75e247e1..7335b8c0e 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py +++ 
b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py @@ -3,4 +3,4 @@ # SPDX-License-Identifier: Apache-2.0 from .reader import JinaReader -__all__ = ["JinaReader"] \ No newline at end of file +__all__ = ["JinaReader"] diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index c2a286813..d1162fef3 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -2,49 +2,14 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 + import urllib -from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Union import requests from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace - - -class JinaReaderMode(Enum): - """ - Enum representing modes for the Jina Reader. - - Modes: - READ: For reading documents. - SEARCH: For searching within documents. - GROUND: For grounding or fact checking. - - """ - - READ = "READ" - SEARCH = "SEARCH" - GROUND = "GROUND" - - def __str__(self): - return self.value - - @classmethod - def from_str(cls, string: str) -> "JinaReaderMode": - """ - Create the reader mode from a string. - - :param string: - String to convert. - :returns: - Reader mode. - """ - enum_map = {e.value: e for e in JinaReaderMode} - reader_mode = enum_map.get(string) - if reader_mode is None: - msg = f"Unknown reader mode '{string}'. 
Supported modes are: {list(enum_map.keys())}" - raise ValueError(msg) - return reader_mode +from reader_mode import JinaReaderMode @component @@ -93,13 +58,12 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": @component.output_types(document=Document) def run(self, query: str): - # check input depending on mode mode_map = {"READ": "r", "SEARCH": "s", "GROUND": "g"} mode = mode_map[self.mode] base_url = f"https://{mode}.jina.ai/" encoded_target = urllib.parse.quote(query, safe="") url = f"{base_url}{encoded_target}" response = self._session.get(url) - metadata = {"content_type": response.headers["Content-Type"], "url": input} + metadata = {"content_type": response.headers["Content-Type"], "query": query} document = [Document(content=response.content, meta=metadata)] return document diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py new file mode 100644 index 000000000..6dff4cac3 --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py @@ -0,0 +1,41 @@ +##### Haystack Implementation ###### +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from enum import Enum + + +class JinaReaderMode(Enum): + """ + Enum representing modes for the Jina Reader. + + Modes: + READ: For reading documents. + SEARCH: For searching within documents. + GROUND: For grounding or fact checking. + + """ + + READ = "READ" + SEARCH = "SEARCH" + GROUND = "GROUND" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "JinaReaderMode": + """ + Create the reader mode from a string. + + :param string: + String to convert. + :returns: + Reader mode. + """ + enum_map = {e.value: e for e in JinaReaderMode} + reader_mode = enum_map.get(string) + if reader_mode is None: + msg = f"Unknown reader mode '{string}'. 
Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return reader_mode From 588bc66804f72ecf1675fa596ff05e84c4704889 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 20:58:31 -0400 Subject: [PATCH 10/45] add docstrings --- .../components/reader/jina/reader.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index d1162fef3..42706702e 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -14,11 +14,35 @@ @component class JinaReader: + """ + A component that interacts with Jina AI's reader service to process queries and return documents. + + This component supports different modes of operation: READ, SEARCH, and GROUND. + + Usage example: + ```python + from haystack import Document + from haystack_integrations.components.readers.jina import JinaReader + + reader = JinaReader(mode="READ") + query = "https://example.com" + result = reader.run(query=query) + document = result["document"] + print(document.content) + ``` + """ def __init__( self, mode: Union[JinaReaderMode, str], api_key: Secret = Secret.from_env_var("JINA_API_KEY"), ): + """ + Initialize a JinaReader instance. + + :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). + :param api_key: The Jina API key. It can be explicitly provided or automatically read from the + environment variable JINA_API_KEY (recommended). + """ resolved_api_key = api_key.resolve_value() self.api_key = api_key self.mode = mode @@ -58,6 +82,12 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": @component.output_types(document=Document) def run(self, query: str): + """ + Process the query using the Jina AI reader service. + + :param query: The query string or URL to process. 
+ :returns: A list containing a single Document object with the processed content and metadata. + """ mode_map = {"READ": "r", "SEARCH": "s", "GROUND": "g"} mode = mode_map[self.mode] base_url = f"https://{mode}.jina.ai/" From 1eb75f9d9ef347d7ad4c8b9a2cd574b619240495 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 21:07:41 -0400 Subject: [PATCH 11/45] add JINA url for ref --- .../haystack_integrations/components/reader/jina/reader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 42706702e..761dcede2 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -9,7 +9,8 @@ import requests from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace -from reader_mode import JinaReaderMode + +from haystack_integrations.components.reader.jina import JinaReaderMode @component @@ -31,6 +32,7 @@ class JinaReader: print(document.content) ``` """ + def __init__( self, mode: Union[JinaReaderMode, str], @@ -39,7 +41,7 @@ def __init__( """ Initialize a JinaReader instance. - :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). + :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). See each Mode and its function here https://jina.ai/reader/ :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). 
""" From b8f0a3f423e8c8718a00f49684ec1642c32a5954 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 21:16:31 -0400 Subject: [PATCH 12/45] add mode norm for mode map check in run method --- .../haystack_integrations/components/reader/jina/reader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 761dcede2..5289082f1 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -58,6 +58,11 @@ def __init__( } ) + if isinstance(mode, JinaReaderMode): + self.mode = mode.value + elif isinstance(mode, str): + self.mode = mode.upper() + def to_dict(self) -> Dict[str, Any]: """ Serializes the component to a dictionary. From faa8d17a6ca1f1ac683b81ec3843f4104db11d3c Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 21 Oct 2024 21:18:01 -0400 Subject: [PATCH 13/45] add mode norm for mode map check in run method --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 5289082f1..bd1720e29 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -47,8 +47,6 @@ def __init__( """ resolved_api_key = api_key.resolve_value() self.api_key = api_key - self.mode = mode - self._session = requests.Session() self._session.headers.update( { From a8e6a5b613f21bcc83a5dd148a760e671d7c4a3e Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Tue, 29 Oct 2024 22:38:16 -0400 Subject: [PATCH 14/45] add json_response and associated parsing --- 
.../components/reader/jina/__init__.py | 4 +-- .../components/reader/jina/reader.py | 35 ++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py index 7335b8c0e..3d4519b35 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from .reader import JinaReader +from .reader import JinaReaderConnector -__all__ = ["JinaReader"] +__all__ = ["JinaReaderConnector"] diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index bd1720e29..6dc2fda31 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import json import urllib from typing import Any, Dict, Union @@ -14,7 +15,7 @@ @component -class JinaReader: +class JinaReaderConnector: """ A component that interacts with Jina AI's reader service to process queries and return documents. @@ -37,6 +38,7 @@ def __init__( self, mode: Union[JinaReaderMode, str], api_key: Secret = Secret.from_env_var("JINA_API_KEY"), + json_response: bool = True, ): """ Initialize a JinaReader instance. 
@@ -48,6 +50,7 @@ def __init__( resolved_api_key = api_key.resolve_value() self.api_key = api_key self._session = requests.Session() + self.json_response = json_response self._session.headers.update( { "Authorization": f"Bearer {resolved_api_key}", @@ -56,6 +59,9 @@ def __init__( } ) + if self.json_response: + self._session.headers.update({"Accept": "application/json"}) + if isinstance(mode, JinaReaderMode): self.mode = mode.value elif isinstance(mode, str): @@ -71,10 +77,11 @@ def to_dict(self) -> Dict[str, Any]: self, api_key=self.api_key.to_dict(), mode=self.mode, + json_response=self.json_response, ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": + def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector": """ Deserializes the component from a dictionary. :param data: @@ -85,6 +92,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReader": deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) + # TODO add headers param @component.output_types(document=Document) def run(self, query: str): """ @@ -99,6 +107,25 @@ def run(self, query: str): encoded_target = urllib.parse.quote(query, safe="") url = f"{base_url}{encoded_target}" response = self._session.get(url) - metadata = {"content_type": response.headers["Content-Type"], "query": query} - document = [Document(content=response.content, meta=metadata)] + + if self.json_response: + if self.mode == "READ": + response_json = json.loads(response.content).get("data", {}) + content = response_json.pop("content") + document = [Document(content=content, meta=response_json)] + if self.mode == "SEARCH": + document = [] + response_json = json.loads(response.content).get("data", {}) + for record in response_json: + content = record.pop("content") + doc = [Document(content=content, meta=record)] + document.append(doc) + return document + else: + response_json = json.loads(response.content).get("data", {}) + content = 
response_json.pop("reason") + document = [Document(content=content, meta=response_json)] + else: + metadata = {"content_type": response.headers["Content-Type"], "query": query} + document = [Document(content=response.content, meta=metadata)] return document From 72e29f40242a27a4a16da317531935de2944b2d3 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Tue, 29 Oct 2024 22:42:26 -0400 Subject: [PATCH 15/45] ignore api key lint error --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 6dc2fda31..4611a0d79 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -37,7 +37,7 @@ class JinaReaderConnector: def __init__( self, mode: Union[JinaReaderMode, str], - api_key: Secret = Secret.from_env_var("JINA_API_KEY"), + api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 json_response: bool = True, ): """ From 09d889109e46777701e7c18fc1c27e817e9516f5 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Tue, 29 Oct 2024 22:44:28 -0400 Subject: [PATCH 16/45] ignore api key lint error --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 4611a0d79..481f4bbb8 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -37,7 +37,7 @@ class JinaReaderConnector: def __init__( self, mode: Union[JinaReaderMode, str], - api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # 
noqa: B008 + api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 json_response: bool = True, ): """ From ff36a9ac20d91adb5c854e899b771006777385c2 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Tue, 29 Oct 2024 23:02:54 -0400 Subject: [PATCH 17/45] reduce code redundancy --- .../components/reader/jina/reader.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 481f4bbb8..fa2a868c5 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -5,7 +5,7 @@ import json import urllib -from typing import Any, Dict, Union +from typing import Any, Dict, List, Union import requests from haystack import Document, component, default_from_dict, default_to_dict @@ -92,8 +92,16 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector": deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) + def parse_json_response(self, data: dict) -> Document: + if self.mode == "GROUND": + content = data.pop("reason") + else: + content = data.pop("content") + document = Document(content=content, meta=data) + return document + # TODO add headers param - @component.output_types(document=Document) + @component.output_types(document=List[Document]) def run(self, query: str): """ Process the query using the Jina AI reader service. 
@@ -109,22 +117,12 @@ def run(self, query: str): response = self._session.get(url) if self.json_response: - if self.mode == "READ": - response_json = json.loads(response.content).get("data", {}) - content = response_json.pop("content") - document = [Document(content=content, meta=response_json)] + response_json = json.loads(response.content).get("data", {}) if self.mode == "SEARCH": - document = [] - response_json = json.loads(response.content).get("data", {}) - for record in response_json: - content = record.pop("content") - doc = [Document(content=content, meta=record)] - document.append(doc) - return document + documents = [self.parse_json_response(record) for record in response_json] + return documents else: - response_json = json.loads(response.content).get("data", {}) - content = response_json.pop("reason") - document = [Document(content=content, meta=response_json)] + return [self.parse_json_response(response_json)] else: metadata = {"content_type": response.headers["Content-Type"], "query": query} document = [Document(content=response.content, meta=metadata)] From f7623e2cb3be8f9ec8d6125a78b9e8af72aff305 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Tue, 29 Oct 2024 23:06:10 -0400 Subject: [PATCH 18/45] reduce code redundancy --- .../haystack_integrations/components/reader/jina/reader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index fa2a868c5..89be88cba 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -121,9 +121,8 @@ def run(self, query: str): if self.mode == "SEARCH": documents = [self.parse_json_response(record) for record in response_json] return documents - else: - return [self.parse_json_response(response_json)] + return 
[self.parse_json_response(response_json)] else: metadata = {"content_type": response.headers["Content-Type"], "query": query} - document = [Document(content=response.content, meta=metadata)] - return document + documents = [Document(content=response.content, meta=metadata)] + return documents From 2f3afb03538fa91f4cfae0995e91a6eb6e649a0a Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Thu, 31 Oct 2024 20:08:54 -0400 Subject: [PATCH 19/45] add headers option to run method --- .../haystack_integrations/components/reader/jina/reader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 89be88cba..a88c08683 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -5,7 +5,7 @@ import json import urllib -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import requests from haystack import Document, component, default_from_dict, default_to_dict @@ -102,13 +102,15 @@ def parse_json_response(self, data: dict) -> Document: # TODO add headers param @component.output_types(document=List[Document]) - def run(self, query: str): + def run(self, query: str, headers: Optional[Dict[str, str]] = None): """ Process the query using the Jina AI reader service. :param query: The query string or URL to process. :returns: A list containing a single Document object with the processed content and metadata. 
""" + if headers: + self._session.headers.update(headers) mode_map = {"READ": "r", "SEARCH": "s", "GROUND": "g"} mode = mode_map[self.mode] base_url = f"https://{mode}.jina.ai/" From f10d0b2dc0cea713fd461f0b181d955ae4fbb884 Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:43:18 -0400 Subject: [PATCH 20/45] Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci --- .../src/haystack_integrations/components/reader/jina/reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index a88c08683..8a81b4879 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -55,7 +55,6 @@ def __init__( { "Authorization": f"Bearer {resolved_api_key}", "Accept-Encoding": "identity", - "Content-type": "application/json", } ) From cb183de55005862e55c29ed0fa60eb2767fd9912 Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:43:46 -0400 Subject: [PATCH 21/45] Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci --- .../haystack_integrations/components/reader/jina/reader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 8a81b4879..1c5216318 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -61,10 +61,8 @@ def __init__( if self.json_response: 
self._session.headers.update({"Accept": "application/json"}) - if isinstance(mode, JinaReaderMode): - self.mode = mode.value - elif isinstance(mode, str): - self.mode = mode.upper() + if isinstance(mode, str): + self.mode = JinaReaderMode.from_str(mode) def to_dict(self) -> Dict[str, Any]: """ From 9a955ba9a43ac3e64dd7eccb6b75e16418ce4cb8 Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:44:18 -0400 Subject: [PATCH 22/45] Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py index 1c5216318..2dba0a98a 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -90,7 +90,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector": return default_from_dict(cls, data) def parse_json_response(self, data: dict) -> Document: - if self.mode == "GROUND": + if self.mode == JinaReaderMode.GROUND: content = data.pop("reason") else: content = data.pop("content") From 3ca7b1beaa38a8d5ba4da7265e0fcd14f9d65f3b Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:46:07 -0400 Subject: [PATCH 23/45] Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci --- .../src/haystack_integrations/components/reader/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py 
index 2dba0a98a..08b96f7da 100644 --- a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py @@ -73,7 +73,7 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, api_key=self.api_key.to_dict(), - mode=self.mode, + mode=str(self.mode), json_response=self.json_response, ) From 91c4d6e3483ac255b2e90f0fbefadb503ffed66e Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Fri, 1 Nov 2024 08:49:38 -0400 Subject: [PATCH 24/45] update location / final edits --- .../components/{reader => converters}/jina/__init__.py | 0 .../components/{reader => converters}/jina/reader.py | 2 +- .../components/{reader => converters}/jina/reader_mode.py | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename integrations/jina/src/haystack_integrations/components/{reader => converters}/jina/__init__.py (100%) rename integrations/jina/src/haystack_integrations/components/{reader => converters}/jina/reader.py (97%) rename integrations/jina/src/haystack_integrations/components/{reader => converters}/jina/reader_mode.py (100%) diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/converters/jina/__init__.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/reader/jina/__init__.py rename to integrations/jina/src/haystack_integrations/components/converters/jina/__init__.py diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py b/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py similarity index 97% rename from integrations/jina/src/haystack_integrations/components/reader/jina/reader.py rename to integrations/jina/src/haystack_integrations/components/converters/jina/reader.py index 08b96f7da..0b7ab701e 100644 --- 
a/integrations/jina/src/haystack_integrations/components/reader/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py @@ -108,7 +108,7 @@ def run(self, query: str, headers: Optional[Dict[str, str]] = None): """ if headers: self._session.headers.update(headers) - mode_map = {"READ": "r", "SEARCH": "s", "GROUND": "g"} + mode_map = {JinaReaderMode.READ: "r", JinaReaderMode.SEARCH: "s", JinaReaderMode.GROUND: "g"} mode = mode_map[self.mode] base_url = f"https://{mode}.jina.ai/" encoded_target = urllib.parse.quote(query, safe="") diff --git a/integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/converters/jina/reader_mode.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/reader/jina/reader_mode.py rename to integrations/jina/src/haystack_integrations/components/converters/jina/reader_mode.py From bb4f0a4ab5673e958a8bddacc18e1b1c89ec8d08 Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Thu, 7 Nov 2024 20:54:08 -0500 Subject: [PATCH 25/45] Update integrations/jina/src/haystack_integrations/components/converters/jina/reader.py Co-authored-by: Stefano Fiorucci --- .../haystack_integrations/components/converters/jina/reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py b/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py index 0b7ab701e..3fb659d04 100644 --- a/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py @@ -97,7 +97,6 @@ def parse_json_response(self, data: dict) -> Document: document = Document(content=content, meta=data) return document - # TODO add headers param @component.output_types(document=List[Document]) def run(self, 
query: str, headers: Optional[Dict[str, str]] = None): """ From fe28a1e8094d83cb47d39428a458bf73f403d415 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Thu, 7 Nov 2024 20:57:33 -0500 Subject: [PATCH 26/45] update paths --- .../components/{converters => connectors}/jina/__init__.py | 0 .../components/{converters => connectors}/jina/reader.py | 0 .../components/{converters => connectors}/jina/reader_mode.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename integrations/jina/src/haystack_integrations/components/{converters => connectors}/jina/__init__.py (100%) rename integrations/jina/src/haystack_integrations/components/{converters => connectors}/jina/reader.py (100%) rename integrations/jina/src/haystack_integrations/components/{converters => connectors}/jina/reader_mode.py (100%) diff --git a/integrations/jina/src/haystack_integrations/components/converters/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/converters/jina/__init__.py rename to integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py diff --git a/integrations/jina/src/haystack_integrations/components/converters/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/converters/jina/reader.py rename to integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py diff --git a/integrations/jina/src/haystack_integrations/components/converters/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py similarity index 100% rename from integrations/jina/src/haystack_integrations/components/converters/jina/reader_mode.py rename to integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py From 
6a66ea14544fcafe1d4258b05af3bab286d234dc Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Thu, 7 Nov 2024 20:59:49 -0500 Subject: [PATCH 27/45] add descriptions for json response/headers --- .../components/connectors/jina/reader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py index 3fb659d04..4027e9ba0 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -11,7 +11,7 @@ from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace -from haystack_integrations.components.reader.jina import JinaReaderMode +from haystack_integrations.components.connectors.jina import JinaReaderMode @component @@ -24,7 +24,7 @@ class JinaReaderConnector: Usage example: ```python from haystack import Document - from haystack_integrations.components.readers.jina import JinaReader + from haystack_integrations.components.connectors.jina import JinaReader reader = JinaReader(mode="READ") query = "https://example.com" @@ -46,6 +46,8 @@ def __init__( :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). See each Mode and its function here https://jina.ai/reader/ :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). + :param json_response: A boolean to indicate whether to return the response in JSON format. Default is True. + If set to False, the response will be in markdown format. """ resolved_api_key = api_key.resolve_value() self.api_key = api_key @@ -103,6 +105,7 @@ def run(self, query: str, headers: Optional[Dict[str, str]] = None): Process the query using the Jina AI reader service. 
:param query: The query string or URL to process. + :param headers: Optional headers to include in the request. :returns: A list containing a single Document object with the processed content and metadata. """ if headers: From 08f488a9c21a3a15d17a336643dca4e1943d0df8 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Thu, 7 Nov 2024 21:02:38 -0500 Subject: [PATCH 28/45] lint --- .../haystack_integrations/components/connectors/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py index 4027e9ba0..8eec268af 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -46,7 +46,7 @@ def __init__( :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). See each Mode and its function here https://jina.ai/reader/ :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). - :param json_response: A boolean to indicate whether to return the response in JSON format. Default is True. + :param json_response: A boolean to indicate whether to return the response in JSON format. Default is True. If set to False, the response will be in markdown format. 
""" resolved_api_key = api_key.resolve_value() From 2a73245af3af177b95b53407902266d566e800f1 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 11 Nov 2024 20:06:03 -0500 Subject: [PATCH 29/45] unit tests for reader-connector --- .../jina/tests/test_reader_connector.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 integrations/jina/tests/test_reader_connector.py diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py new file mode 100644 index 000000000..b85afd750 --- /dev/null +++ b/integrations/jina/tests/test_reader_connector.py @@ -0,0 +1,66 @@ +from unittest.mock import patch + +import pytest +from haystack.utils import Secret + +from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode + + +class TestJinaReaderConnector: + + @pytest.fixture + def mock_session(self): + with patch("requests.Session") as mock: + yield mock.return_value + + def test_init_with_custom_parameters(self): + reader = JinaReaderConnector( + mode="READ", + api_key=Secret.from_token("test-api-key"), + json_response=False + ) + + assert reader.mode == JinaReaderMode.READ + assert reader.api_key.resolve_value() == "test-api-key" + assert reader.json_response is False + assert "Accept" not in reader._session.headers + + + def test_to_dict(self): + reader = JinaReaderConnector( + mode="SEARCH", + api_key=Secret.from_token("test-api-key"), + json_response=True + ) + + serialized = reader.to_dict() + + assert serialized["type"] == "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector" + assert "init_parameters" in serialized + + init_params = serialized["init_parameters"] + assert init_params["mode"] == "SEARCH" + assert init_params["json_response"] is True + assert "api_key" in init_params + assert init_params["api_key"]["type"] == "secret" + assert "value" in init_params["api_key"] + + + def test_from_dict(self): + component_dict = { + "type": 
"haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", + "init_parameters": { + "mode": "GROUND", + "json_response": False, + "api_key": {"type": "secret", "value": "another-test-key"} + } + } + + + reader = JinaReaderConnector.from_dict(component_dict) + + + assert isinstance(reader, JinaReaderConnector) + assert reader.mode == JinaReaderMode.GROUND + assert reader.json_response is False + assert reader.api_key.resolve_value() == "another-test-key" From 8271def438af4fae535bb71ae575883a5ad1bd3c Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 11 Nov 2024 20:07:34 -0500 Subject: [PATCH 30/45] unit tests for reader-connector --- .../jina/tests/test_reader_connector.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index b85afd750..30a4cff72 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -7,31 +7,21 @@ class TestJinaReaderConnector: - @pytest.fixture def mock_session(self): with patch("requests.Session") as mock: yield mock.return_value def test_init_with_custom_parameters(self): - reader = JinaReaderConnector( - mode="READ", - api_key=Secret.from_token("test-api-key"), - json_response=False - ) + reader = JinaReaderConnector(mode="READ", api_key=Secret.from_token("test-api-key"), json_response=False) assert reader.mode == JinaReaderMode.READ assert reader.api_key.resolve_value() == "test-api-key" assert reader.json_response is False assert "Accept" not in reader._session.headers - def test_to_dict(self): - reader = JinaReaderConnector( - mode="SEARCH", - api_key=Secret.from_token("test-api-key"), - json_response=True - ) + reader = JinaReaderConnector(mode="SEARCH", api_key=Secret.from_token("test-api-key"), json_response=True) serialized = reader.to_dict() @@ -45,21 +35,18 @@ def test_to_dict(self): assert 
init_params["api_key"]["type"] == "secret" assert "value" in init_params["api_key"] - def test_from_dict(self): component_dict = { "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", "init_parameters": { "mode": "GROUND", "json_response": False, - "api_key": {"type": "secret", "value": "another-test-key"} - } + "api_key": {"type": "secret", "value": "another-test-key"}, + }, } - reader = JinaReaderConnector.from_dict(component_dict) - assert isinstance(reader, JinaReaderConnector) assert reader.mode == JinaReaderMode.GROUND assert reader.json_response is False From d1a76c58ffb86b40af4c9f6ef8ad7261344ce4ae Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 11 Nov 2024 20:13:44 -0500 Subject: [PATCH 31/45] unit tests for reader-connector --- .../components/connectors/jina/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py index 3d4519b35..95368df21 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py @@ -2,5 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 from .reader import JinaReaderConnector +from .reader_mode import JinaReaderMode -__all__ = ["JinaReaderConnector"] +__all__ = ["JinaReaderConnector", "JinaReaderMode"] From 13ed1575dbcf108a2f91218f06a8d44f78832555 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 12 Nov 2024 17:59:28 +0100 Subject: [PATCH 32/45] fix circular import --- .../haystack_integrations/components/connectors/jina/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py index 8eec268af..d64b727da 100644 
--- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -11,7 +11,7 @@ from haystack import Document, component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace -from haystack_integrations.components.connectors.jina import JinaReaderMode +from .reader_mode import JinaReaderMode @component From d9c8ffe55fd78c20bd073893ce2859879ed74544 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 09:01:50 -0500 Subject: [PATCH 33/45] update header test --- integrations/jina/tests/test_reader_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 30a4cff72..93a62afe8 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -18,7 +18,7 @@ def test_init_with_custom_parameters(self): assert reader.mode == JinaReaderMode.READ assert reader.api_key.resolve_value() == "test-api-key" assert reader.json_response is False - assert "Accept" not in reader._session.headers + assert "application/json" not in reader._session.headers.values() def test_to_dict(self): reader = JinaReaderConnector(mode="SEARCH", api_key=Secret.from_token("test-api-key"), json_response=True) From 4d1618803e9ddb70e2e820efdfdab98e52dec967 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 09:11:32 -0500 Subject: [PATCH 34/45] update test --- integrations/jina/tests/test_reader_connector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 93a62afe8..673befa12 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -1,3 +1,4 @@ +import os from 
unittest.mock import patch import pytest @@ -5,6 +6,7 @@ from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode +os.environ["TEST_KEY"] = "test-api-key" class TestJinaReaderConnector: @pytest.fixture @@ -13,7 +15,7 @@ def mock_session(self): yield mock.return_value def test_init_with_custom_parameters(self): - reader = JinaReaderConnector(mode="READ", api_key=Secret.from_token("test-api-key"), json_response=False) + reader = JinaReaderConnector(mode="READ", api_key=Secret.from_env_var("TEST_KEY"), json_response=False) assert reader.mode == JinaReaderMode.READ assert reader.api_key.resolve_value() == "test-api-key" @@ -21,7 +23,7 @@ def test_init_with_custom_parameters(self): assert "application/json" not in reader._session.headers.values() def test_to_dict(self): - reader = JinaReaderConnector(mode="SEARCH", api_key=Secret.from_token("test-api-key"), json_response=True) + reader = JinaReaderConnector(mode="SEARCH", api_key=Secret.from_env_var("TEST_KEY"), json_response=True) serialized = reader.to_dict() @@ -41,7 +43,7 @@ def test_from_dict(self): "init_parameters": { "mode": "GROUND", "json_response": False, - "api_key": {"type": "secret", "value": "another-test-key"}, + "api_key": Secret.from_env_var("TEST_KEY"), }, } @@ -50,4 +52,4 @@ def test_from_dict(self): assert isinstance(reader, JinaReaderConnector) assert reader.mode == JinaReaderMode.GROUND assert reader.json_response is False - assert reader.api_key.resolve_value() == "another-test-key" + assert reader.api_key.resolve_value() == "test-api-key" From a3b30e1ab477ac488ee3193e4a129c410d50b798 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 09:13:00 -0500 Subject: [PATCH 35/45] update test --- integrations/jina/tests/test_reader_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 673befa12..2c0b1fe82 100644 --- 
a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -8,6 +8,7 @@ os.environ["TEST_KEY"] = "test-api-key" + class TestJinaReaderConnector: @pytest.fixture def mock_session(self): From f89313073ef0645709d72526adc12846b23483ac Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 09:17:09 -0500 Subject: [PATCH 36/45] update test --- integrations/jina/tests/test_reader_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 2c0b1fe82..de7d0ae3b 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -35,7 +35,7 @@ def test_to_dict(self): assert init_params["mode"] == "SEARCH" assert init_params["json_response"] is True assert "api_key" in init_params - assert init_params["api_key"]["type"] == "secret" + assert init_params["api_key"]["type"] == "env_var" assert "value" in init_params["api_key"] def test_from_dict(self): From 4cdea7005a8c88ef27e1b5d3d9df44010eedc264 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 09:20:58 -0500 Subject: [PATCH 37/45] update test --- integrations/jina/tests/test_reader_connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index de7d0ae3b..1178924eb 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -36,7 +36,6 @@ def test_to_dict(self): assert init_params["json_response"] is True assert "api_key" in init_params assert init_params["api_key"]["type"] == "env_var" - assert "value" in init_params["api_key"] def test_from_dict(self): component_dict = { From 38fad06675fb966998535a55d74abbee3a8cfdb3 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 16:23:35 -0500 Subject: [PATCH 38/45] update 
test --- integrations/jina/tests/test_reader_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 1178924eb..97f7a1381 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -43,7 +43,7 @@ def test_from_dict(self): "init_parameters": { "mode": "GROUND", "json_response": False, - "api_key": Secret.from_env_var("TEST_KEY"), + "api_key": {"type": "env_var", "env_vars": ["TEST_KEY"]}, }, } From 915aad9ec11f826f6b67078e1b7e1c396f04c759 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 16:41:16 -0500 Subject: [PATCH 39/45] update test --- .../components/connectors/jina/reader.py | 4 ++-- integrations/jina/tests/test_reader_connector.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py index d64b727da..3296ccc31 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -24,9 +24,9 @@ class JinaReaderConnector: Usage example: ```python from haystack import Document - from haystack_integrations.components.connectors.jina import JinaReader + from haystack_integrations.components.connectors.jina import JinaReaderConnector - reader = JinaReader(mode="READ") + reader = JinaReaderConnector(mode="READ") query = "https://example.com" result = reader.run(query=query) document = result["document"] diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index 97f7a1381..caa95099c 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -6,6 +6,7 @@ 
from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode +os.environ["JINA_API_KEY"] = "test-api-key" os.environ["TEST_KEY"] = "test-api-key" @@ -41,15 +42,15 @@ def test_from_dict(self): component_dict = { "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", "init_parameters": { - "mode": "GROUND", - "json_response": False, - "api_key": {"type": "env_var", "env_vars": ["TEST_KEY"]}, + "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"]}, + "mode": "READ", + "json_response": True, }, } reader = JinaReaderConnector.from_dict(component_dict) assert isinstance(reader, JinaReaderConnector) - assert reader.mode == JinaReaderMode.GROUND - assert reader.json_response is False + assert reader.mode == JinaReaderMode.READ + assert reader.json_response is True assert reader.api_key.resolve_value() == "test-api-key" From 86e8bdbf056b1652e1b5f469b91b1e070bc7ae4c Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Wed, 13 Nov 2024 16:46:13 -0500 Subject: [PATCH 40/45] update test --- integrations/jina/tests/test_reader_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index caa95099c..aa317871f 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -42,7 +42,7 @@ def test_from_dict(self): component_dict = { "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", "init_parameters": { - "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"]}, + "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"], "strict": True}, "mode": "READ", "json_response": True, }, From 8a61e3b91d1dc449f87005b31e96e6ead80d07f3 Mon Sep 17 00:00:00 2001 From: jlonge4 Date: Mon, 18 Nov 2024 17:17:04 -0500 Subject: [PATCH 41/45] update test --- .../jina/tests/test_reader_connector.py | 28 
+++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index aa317871f..e3f5951ba 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -1,7 +1,8 @@ +import json import os from unittest.mock import patch -import pytest +from haystack import Document from haystack.utils import Secret from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode @@ -11,11 +12,6 @@ class TestJinaReaderConnector: - @pytest.fixture - def mock_session(self): - with patch("requests.Session") as mock: - yield mock.return_value - def test_init_with_custom_parameters(self): reader = JinaReaderConnector(mode="READ", api_key=Secret.from_env_var("TEST_KEY"), json_response=False) @@ -54,3 +50,23 @@ def test_from_dict(self): assert reader.mode == JinaReaderMode.READ assert reader.json_response is True assert reader.api_key.resolve_value() == "test-api-key" + + @patch("requests.Session") + def test_run_with_mocked_response(self, mock_session): + mock_json_response = { + "data": {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"} + } + + mock_response = mock_session.return_value.get.return_value + mock_response.content = json.dumps(mock_json_response).encode("utf-8") + mock_response.headers = {"Content-Type": "application/json"} + + reader = JinaReaderConnector(mode="READ") + result = reader.run(query="https://example.com") + + assert len(result) == 1 + document = result[0] + assert isinstance(document, Document) + assert document.content == "Mocked content" + assert document.meta["title"] == "Mocked Title" + assert document.meta["url"] == "https://example.com" From 5913862007e3c9a62b6f1c87dcc0ac606f63d618 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Nov 2024 17:45:44 +0100 Subject: [PATCH 42/45] refactoring + more tests --- 
integrations/jina/pyproject.toml | 5 +- .../components/connectors/jina/reader.py | 96 +++++++++------- .../components/connectors/jina/reader_mode.py | 13 +-- .../jina/tests/test_reader_connector.py | 107 ++++++++++++++---- 4 files changed, 152 insertions(+), 69 deletions(-) diff --git a/integrations/jina/pyproject.toml b/integrations/jina/pyproject.toml index c89eeacb4..ab8cf505f 100644 --- a/integrations/jina/pyproject.toml +++ b/integrations/jina/pyproject.toml @@ -138,12 +138,15 @@ source = ["haystack_integrations"] branch = true parallel = false - [tool.coverage.report] omit = ["*/tests/*", "*/__init__.py"] show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] +[tool.pytest.ini_options] +minversion = "6.0" +markers = ["unit: unit tests", "integration: integration tests"] + [[tool.mypy.overrides]] module = ["haystack.*", "haystack_integrations.*", "pytest.*"] ignore_missing_imports = true diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py index 3296ccc31..eb53329f7 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -1,11 +1,10 @@ -##### Haystack Implementation ###### # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 import json -import urllib from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote import requests from haystack import Document, component, default_from_dict, default_to_dict @@ -13,24 +12,31 @@ from .reader_mode import JinaReaderMode +READER_ENDPOINT_URL_BY_MODE = { + JinaReaderMode.READ: "https://r.jina.ai/", + JinaReaderMode.SEARCH: "https://s.jina.ai/", + JinaReaderMode.GROUND: "https://g.jina.ai/", +} + @component class JinaReaderConnector: """ A component that interacts with 
Jina AI's reader service to process queries and return documents. - This component supports different modes of operation: READ, SEARCH, and GROUND. + This component supports different modes of operation: `read`, `search`, and `ground`. Usage example: ```python - from haystack import Document from haystack_integrations.components.connectors.jina import JinaReaderConnector - reader = JinaReaderConnector(mode="READ") + reader = JinaReaderConnector(mode="read") query = "https://example.com" result = reader.run(query=query) - document = result["document"] + document = result["documents"][0] print(document.content) + + >>> "This domain is for use in illustrative examples..." ``` """ @@ -43,28 +49,23 @@ def __init__( """ Initialize a JinaReader instance. - :param mode: The operation mode for the reader (READ, SEARCH, or GROUND). See each Mode and its function here https://jina.ai/reader/ + :param mode: The operation mode for the reader (`read`, `search` or `ground`). + - `read`: process a URL and return the textual content of the page. + - `search`: search the web and return textual content of the most relevant pages. + - `ground`: call the grounding engine to perform fact checking. + For more information on the modes, see the [Jina Reader documentation](https://jina.ai/reader/). :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). - :param json_response: A boolean to indicate whether to return the response in JSON format. Default is True. - If set to False, the response will be in markdown format. + :param json_response: Controls the response format from the Jina Reader API. + If `True`, requests a JSON response, resulting in Documents with rich structured metadata. + If `False`, requests a raw response, resulting in one Document with minimal metadata. 
""" - resolved_api_key = api_key.resolve_value() self.api_key = api_key - self._session = requests.Session() self.json_response = json_response - self._session.headers.update( - { - "Authorization": f"Bearer {resolved_api_key}", - "Accept-Encoding": "identity", - } - ) - - if self.json_response: - self._session.headers.update({"Accept": "application/json"}) if isinstance(mode, str): - self.mode = JinaReaderMode.from_str(mode) + mode = JinaReaderMode.from_str(mode) + self.mode = mode def to_dict(self) -> Dict[str, Any]: """ @@ -91,7 +92,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector": deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) - def parse_json_response(self, data: dict) -> Document: + def _json_to_document(self, data: dict) -> Document: + """ + Convert a JSON response/record to a Document, depending on the reader mode. + """ if self.mode == JinaReaderMode.GROUND: content = data.pop("reason") else: @@ -102,28 +106,36 @@ def parse_json_response(self, data: dict) -> Document: @component.output_types(document=List[Document]) def run(self, query: str, headers: Optional[Dict[str, str]] = None): """ - Process the query using the Jina AI reader service. + Process the query/URL using the Jina AI reader service. :param query: The query string or URL to process. - :param headers: Optional headers to include in the request. - :returns: A list containing a single Document object with the processed content and metadata. + :param headers: Optional headers to include in the request for customization. Refer to the + [Jina Reader documentation](https://jina.ai/reader/) for more information. + + :returns: + A dictionary with the following keys: + - `documents`: A list of `Document` objects. 
""" - if headers: - self._session.headers.update(headers) - mode_map = {JinaReaderMode.READ: "r", JinaReaderMode.SEARCH: "s", JinaReaderMode.GROUND: "g"} - mode = mode_map[self.mode] - base_url = f"https://{mode}.jina.ai/" - encoded_target = urllib.parse.quote(query, safe="") - url = f"{base_url}{encoded_target}" - response = self._session.get(url) + headers = headers or {} + headers["Authorization"] = f"Bearer {self.api_key.resolve_value()}" if self.json_response: - response_json = json.loads(response.content).get("data", {}) - if self.mode == "SEARCH": - documents = [self.parse_json_response(record) for record in response_json] - return documents - return [self.parse_json_response(response_json)] - else: - metadata = {"content_type": response.headers["Content-Type"], "query": query} - documents = [Document(content=response.content, meta=metadata)] - return documents + headers["Accept"] = "application/json" + + endpoint_url = READER_ENDPOINT_URL_BY_MODE[self.mode] + encoded_target = quote(query, safe="") + url = f"{endpoint_url}{encoded_target}" + + response = requests.get(url, headers=headers, timeout=60) + + # raw response: we just return a single Document with text + if not self.json_response: + meta = {"content_type": response.headers["Content-Type"], "query": query} + return {"documents": [Document(content=response.content, meta=meta)]} + + response_json = json.loads(response.content).get("data", {}) + if self.mode == JinaReaderMode.SEARCH: + documents = [self._json_to_document(record) for record in response_json] + return {"documents": documents} + + return {"documents": [self._json_to_document(response_json)]} diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py index 6dff4cac3..2ccf7250b 100644 --- a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py +++ 
b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py @@ -1,4 +1,3 @@ -##### Haystack Implementation ###### # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 @@ -10,15 +9,15 @@ class JinaReaderMode(Enum): Enum representing modes for the Jina Reader. Modes: - READ: For reading documents. - SEARCH: For searching within documents. - GROUND: For grounding or fact checking. + READ: Process a URL and return the textual content of the page. + SEARCH: Search the web and return the textual content of the most relevant pages. + GROUND: Call the grounding engine to perform fact checking. """ - READ = "READ" - SEARCH = "SEARCH" - GROUND = "GROUND" + READ = "read" + SEARCH = "search" + GROUND = "ground" def __str__(self): return self.value diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py index e3f5951ba..449f73df8 100644 --- a/integrations/jina/tests/test_reader_connector.py +++ b/integrations/jina/tests/test_reader_connector.py @@ -2,26 +2,29 @@ import os from unittest.mock import patch +import pytest from haystack import Document from haystack.utils import Secret from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode -os.environ["JINA_API_KEY"] = "test-api-key" -os.environ["TEST_KEY"] = "test-api-key" - class TestJinaReaderConnector: - def test_init_with_custom_parameters(self): - reader = JinaReaderConnector(mode="READ", api_key=Secret.from_env_var("TEST_KEY"), json_response=False) + def test_init_with_custom_parameters(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="read", api_key=Secret.from_env_var("TEST_KEY"), json_response=False) assert reader.mode == JinaReaderMode.READ assert reader.api_key.resolve_value() == "test-api-key" assert reader.json_response is False - assert "application/json" not in reader._session.headers.values() - def 
test_to_dict(self): - reader = JinaReaderConnector(mode="SEARCH", api_key=Secret.from_env_var("TEST_KEY"), json_response=True) + def test_init_with_invalid_mode(self): + with pytest.raises(ValueError): + JinaReaderConnector(mode="INVALID") + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="search", api_key=Secret.from_env_var("TEST_KEY"), json_response=True) serialized = reader.to_dict() @@ -29,17 +32,18 @@ def test_to_dict(self): assert "init_parameters" in serialized init_params = serialized["init_parameters"] - assert init_params["mode"] == "SEARCH" + assert init_params["mode"] == "search" assert init_params["json_response"] is True assert "api_key" in init_params assert init_params["api_key"]["type"] == "env_var" - def test_from_dict(self): + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("JINA_API_KEY", "test-api-key") component_dict = { "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", "init_parameters": { "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"], "strict": True}, - "mode": "READ", + "mode": "read", "json_response": True, }, } @@ -51,22 +55,87 @@ def test_from_dict(self): assert reader.json_response is True assert reader.api_key.resolve_value() == "test-api-key" - @patch("requests.Session") - def test_run_with_mocked_response(self, mock_session): + def test_json_to_document_read_mode(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="read") + + data = {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"} + document = reader._json_to_document(data) + + assert isinstance(document, Document) + assert document.content == "Mocked content" + assert document.meta["title"] == "Mocked Title" + assert document.meta["url"] == "https://example.com" + + def test_json_to_document_ground_mode(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", 
"test-api-key") + reader = JinaReaderConnector(mode="ground") + + data = { + "factuality": 0, + "result": False, + "reason": "The statement is contradicted by...", + "references": [{"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False}], + } + + document = reader._json_to_document(data) + assert isinstance(document, Document) + assert document.content == "The statement is contradicted by..." + assert document.meta["factuality"] == 0 + assert document.meta["result"] is False + assert document.meta["references"] == [ + {"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False} + ] + + @patch("requests.get") + def test_run_with_mocked_response(self, mock_get, monkeypatch): + monkeypatch.setenv("JINA_API_KEY", "test-api-key") mock_json_response = { "data": {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"} } + mock_get.return_value.content = json.dumps(mock_json_response).encode("utf-8") + mock_get.return_value.headers = {"Content-Type": "application/json"} - mock_response = mock_session.return_value.get.return_value - mock_response.content = json.dumps(mock_json_response).encode("utf-8") - mock_response.headers = {"Content-Type": "application/json"} - - reader = JinaReaderConnector(mode="READ") + reader = JinaReaderConnector(mode="read") result = reader.run(query="https://example.com") + assert mock_get.call_count == 1 + assert mock_get.call_args[0][0] == "https://r.jina.ai/https%3A%2F%2Fexample.com" + assert mock_get.call_args[1]["headers"] == { + "Authorization": "Bearer test-api-key", + "Accept": "application/json", + } + assert len(result) == 1 - document = result[0] + document = result["documents"][0] assert isinstance(document, Document) assert document.content == "Mocked content" assert document.meta["title"] == "Mocked Title" assert document.meta["url"] == "https://example.com" + + @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env 
var not set") + @pytest.mark.integration + def test_run_reader_mode(self): + reader = JinaReaderConnector(mode="read") + result = reader.run(query="https://example.com") + + assert len(result) == 1 + document = result["documents"][0] + assert isinstance(document, Document) + assert "This domain is for use in illustrative examples" in document.content + assert document.meta["title"] == "Example Domain" + assert document.meta["url"] == "https://example.com/" + + @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env var not set") + @pytest.mark.integration + def test_run_search_mode(self): + reader = JinaReaderConnector(mode="search") + result = reader.run(query="When was Jina AI founded?") + + assert len(result) >= 1 + for doc in result["documents"]: + assert isinstance(doc, Document) + assert doc.content + assert "title" in doc.meta + assert "url" in doc.meta + assert "description" in doc.meta From c169146ecf748bd8752da39f81a6b780612452d1 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Nov 2024 17:45:53 +0100 Subject: [PATCH 43/45] example --- .../jina/examples/jina_reader_connector.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 integrations/jina/examples/jina_reader_connector.py diff --git a/integrations/jina/examples/jina_reader_connector.py b/integrations/jina/examples/jina_reader_connector.py new file mode 100644 index 000000000..24b6f5db3 --- /dev/null +++ b/integrations/jina/examples/jina_reader_connector.py @@ -0,0 +1,47 @@ +# to make use of the JinaReaderConnector, we first need to install the Haystack integration +# pip install jina-haystack + +# then we must set the JINA_API_KEY environment variable +# export JINA_API_KEY= + + +from haystack_integrations.components.connectors.jina import JinaReaderConnector + +# we can use the JinaReaderConnector to process a URL and return the textual content of the page +reader = JinaReaderConnector(mode="read") +query = "https://example.com" +result 
= reader.run(query=query) + +print(result) +# {'documents': [Document(id=fa3e51e4ca91828086dca4f359b6e1ea2881e358f83b41b53c84616cb0b2f7cf, +# content: 'This domain is for use in illustrative examples in documents. You may use this domain in literature ...', +# meta: {'title': 'Example Domain', 'description': '', 'url': 'https://example.com/', 'usage': {'tokens': 42}})]} + + +# we can perform a web search by setting the mode to "search" +reader = JinaReaderConnector(mode="search") +query = "UEFA Champions League 2024" +result = reader.run(query=query) + +print(result) +# {'documents': [Document(id=6a71abf9955594232037321a476d39a835c0cb7bc575d886ee0087c973c95940, +# content: '2024/25 UEFA Champions League: Matches, draw, final, key dates | UEFA Champions League | UEFA.com...', +# meta: {'title': '2024/25 UEFA Champions League: Matches, draw, final, key dates', +# 'description': 'What are the match dates? Where is the 2025 final? How will the competition work?', +# 'url': 'https://www.uefa.com/uefachampionsleague/news/...', +# 'usage': {'tokens': 5581}}), ...]} + + +# finally, we can perform fact-checking by setting the mode to "ground" (experimental) +reader = JinaReaderConnector(mode="ground") +query = "ChatGPT was launched in 2017" +result = reader.run(query=query) + +print(result) +# {'documents': [Document(id=f0c964dbc1ebb2d6584c8032b657150b9aa6e421f714cc1b9f8093a159127f0c, +# content: 'The statement that ChatGPT was launched in 2017 is incorrect. 
Multiple references confirm that ChatG...', +# meta: {'factuality': 0, 'result': False, 'references': [ +# {'url': 'https://en.wikipedia.org/wiki/ChatGPT', +# 'keyQuote': 'ChatGPT is a generative artificial intelligence (AI) chatbot developed by OpenAI and launched in 2022.', +# 'isSupportive': False}, ...], +# 'usage': {'tokens': 10188}})]} From 72ed31ccbeae73899a3dab1b4b930e86973afceb Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Nov 2024 17:47:36 +0100 Subject: [PATCH 44/45] pydoc config --- integrations/jina/pydoc/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/jina/pydoc/config.yml b/integrations/jina/pydoc/config.yml index 8c7a241f6..2d0ef4f87 100644 --- a/integrations/jina/pydoc/config.yml +++ b/integrations/jina/pydoc/config.yml @@ -6,6 +6,7 @@ loaders: "haystack_integrations.components.embedders.jina.document_embedder", "haystack_integrations.components.embedders.jina.text_embedder", "haystack_integrations.components.rankers.jina.ranker", + "haystack_integrations.components.connectors.jina.reader", ] ignore_when_discovered: ["__init__"] processors: From 4f48f08066cf2bdb812a1583db49690d8945350f Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Nov 2024 17:54:44 +0100 Subject: [PATCH 45/45] examples can contain print --- integrations/jina/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integrations/jina/pyproject.toml b/integrations/jina/pyproject.toml index ab8cf505f..e3af086d0 100644 --- a/integrations/jina/pyproject.toml +++ b/integrations/jina/pyproject.toml @@ -132,6 +132,8 @@ ban-relative-imports = "parents" [tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] [tool.coverage.run] source = ["haystack_integrations"]