diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 24e7224f8..6f7ef3f93 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -43,6 +43,7 @@ def __init__( method: Optional[Dict[str, Any]] = None, mappings: Optional[Dict[str, Any]] = None, settings: Optional[Dict[str, Any]] = DEFAULT_SETTINGS, + create_index: bool = True, **kwargs, ): """ @@ -67,6 +68,7 @@ def __init__( Defaults to None :param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings) for more information. Defaults to {"index.knn": True} + :param create_index: Whether to create the index if it doesn't exist. Defaults to True :param **kwargs: Optional arguments that ``OpenSearch`` takes. 
For the full list of supported kwargs, see the [official OpenSearch reference](https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html) """ @@ -79,6 +81,7 @@ def __init__( self._method = method self._mappings = mappings or self._get_default_mappings() self._settings = settings + self._create_index = create_index self._kwargs = kwargs def _get_default_mappings(self) -> Dict[str, Any]: @@ -113,13 +116,39 @@ def client(self) -> OpenSearch: "`settings` values will be ignored.", self._index, ) - else: + elif self._create_index: # Create the index if it doesn't exist body = {"mappings": self._mappings, "settings": self._settings} - self._client.indices.create(index=self._index, body=body) # type:ignore return self._client + def create_index( + self, + index: Optional[str] = None, + mappings: Optional[Dict[str, Any]] = None, + settings: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Creates an index in OpenSearch if it does not already exist. + + Note that this method ignores the `create_index` argument from the constructor. + + :param index: Name of the index to create. If None or empty, the index name from the constructor is used. + :param mappings: The mapping of how the documents are stored and indexed. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/field-types/) + for more information. If None, the mappings from the constructor are used. An explicitly passed empty dict is used as given. + :param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings) + for more information. If None, the settings from the constructor are used. An explicitly passed empty dict is used as given. 
+ """ + if not index: + index = self._index + if mappings is None: + mappings = self._mappings + if settings is None: + settings = self._settings + + if not self.client.indices.exists(index=index): + self.client.indices.create(index=index, body={"mappings": mappings, "settings": settings}) + def to_dict(self) -> Dict[str, Any]: # This is not the best solution to serialise this class but is the fastest to implement. # Not all kwargs types can be serialised to text so this can fail. We must serialise each @@ -139,6 +168,8 @@ def to_dict(self) -> Dict[str, Any]: method=self._method, mappings=self._mappings, settings=self._settings, + create_index=self._create_index, + return_embedding=self._return_embedding, **self._kwargs, ) diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index 71fc19c6a..4242386f0 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -43,6 +43,8 @@ def test_to_dict(_mock_opensearch_client): "max_chunk_bytes": DEFAULT_MAX_CHUNK_BYTES, "method": None, "settings": {"index.knn": True}, + "return_embedding": False, + "create_index": True, }, "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", }, diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index af8fd8e25..4b7e242f2 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -35,6 +35,8 @@ def test_to_dict(_mock_opensearch_client): "max_chunk_bytes": DEFAULT_MAX_CHUNK_BYTES, "method": None, "settings": {"index.knn": True}, + "return_embedding": False, + "create_index": True, }, } @@ -43,7 +45,14 @@ def test_to_dict(_mock_opensearch_client): def test_from_dict(_mock_opensearch_client): data = { "type": 
"haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", - "init_parameters": {"hosts": "some hosts", "index": "default", "max_chunk_bytes": 1000, "embedding_dim": 1536}, + "init_parameters": { + "hosts": "some hosts", + "index": "default", + "max_chunk_bytes": 1000, + "embedding_dim": 1536, + "create_index": False, + "return_embedding": True, + }, } document_store = OpenSearchDocumentStore.from_dict(data) assert document_store._hosts == "some hosts" @@ -66,6 +75,8 @@ def test_from_dict(_mock_opensearch_client): ], } assert document_store._settings == {"index.knn": True} + assert document_store._return_embedding is True + assert document_store._create_index is False @patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") @@ -113,6 +124,30 @@ def document_store(self, request): yield store store.client.indices.delete(index=index, params={"ignore": [400, 404]}) + @pytest.fixture + def document_store_readonly(self, request): + """ + Provide a document store built with `create_index=False`, so the store does not create its index on its own. + The cluster setting `action.auto_create_index` is set to False before the test and back to True afterwards; the index is then deleted. 
+ """ + hosts = ["https://localhost:9200"] + # Use a different index for each test so we can run them in parallel; NOTE(review): the `action.auto_create_index` toggle below goes through the cluster settings API, not the index - confirm it is safe under parallel runs + index = f"{request.node.name}" + + store = OpenSearchDocumentStore( + hosts=hosts, + index=index, + http_auth=("admin", "admin"), + verify_certs=False, + embedding_dim=768, + method={"space_type": "cosinesimil", "engine": "nmslib", "name": "hnsw"}, + create_index=False, + ) + store.client.cluster.put_settings(body={"transient": {"action.auto_create_index": False}}) + yield store + store.client.cluster.put_settings(body={"transient": {"action.auto_create_index": True}}) + store.client.indices.delete(index=index, params={"ignore": [400, 404]}) + @pytest.fixture def document_store_embedding_dim_4(self, request): """ @@ -165,6 +200,15 @@ def test_write_documents(self, document_store: OpenSearchDocumentStore): with pytest.raises(DuplicateDocumentError): document_store.write_documents(docs, DuplicatePolicy.FAIL) + def test_write_documents_readonly(self, document_store_readonly: OpenSearchDocumentStore): + docs = [Document(id="1")] + with pytest.raises(DocumentStoreError, match="index_not_found_exception"): + document_store_readonly.write_documents(docs) + + def test_create_index(self, document_store_readonly: OpenSearchDocumentStore): + document_store_readonly.create_index() + assert document_store_readonly.client.indices.exists(index=document_store_readonly._index) + def test_bm25_retrieval(self, document_store: OpenSearchDocumentStore): document_store.write_documents( [ diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py index c1015ca33..7bf6c09eb 100644 --- a/integrations/opensearch/tests/test_embedding_retriever.py +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -58,6 +58,8 @@ def test_to_dict(_mock_opensearch_client): "settings": { "index.knn": True, }, + "return_embedding": False, + "create_index": True, }, "type": 
"haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", },