Skip to content

Commit

Permalink
feat: add create_index option to OpenSearchDocumentStore (#840)
Browse files Browse the repository at this point in the history
* [opensearch] feat: add create_index option

* fix lint

* fix lint

* add create_index() method

* fix lint

* better match

* fix docs
  • Loading branch information
tstadel authored Jun 25, 2024
1 parent 1c557cb commit be09adf
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
method: Optional[Dict[str, Any]] = None,
mappings: Optional[Dict[str, Any]] = None,
settings: Optional[Dict[str, Any]] = DEFAULT_SETTINGS,
create_index: bool = True,
**kwargs,
):
"""
Expand All @@ -67,6 +68,7 @@ def __init__(
Defaults to None
:param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings)
for more information. Defaults to {"index.knn": True}
:param create_index: Whether to create the index if it doesn't exist. Defaults to True
:param **kwargs: Optional arguments that ``OpenSearch`` takes. For the full list of supported kwargs,
see the [official OpenSearch reference](https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html)
"""
Expand All @@ -79,6 +81,7 @@ def __init__(
self._method = method
self._mappings = mappings or self._get_default_mappings()
self._settings = settings
self._create_index = create_index
self._kwargs = kwargs

def _get_default_mappings(self) -> Dict[str, Any]:
Expand Down Expand Up @@ -113,13 +116,39 @@ def client(self) -> OpenSearch:
"`settings` values will be ignored.",
self._index,
)
else:
elif self._create_index:
# Create the index if it doesn't exist
body = {"mappings": self._mappings, "settings": self._settings}

self._client.indices.create(index=self._index, body=body) # type:ignore
return self._client

def create_index(
    self,
    index: Optional[str] = None,
    mappings: Optional[Dict[str, Any]] = None,
    settings: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Creates an index in OpenSearch if it does not exist yet.

    Note that this method ignores the `create_index` argument from the constructor.

    :param index: Name of the index to create. If None (or empty), the index name from the constructor is used.
    :param mappings: The mapping of how the documents are stored and indexed. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/field-types/)
        for more information. If None, the mappings from the constructor are used. An explicitly passed
        empty dict is sent as is.
    :param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings)
        for more information. If None, the settings from the constructor are used. An explicitly passed
        empty dict is sent as is.
    """
    # An empty index name can never be valid, so any falsy value falls back to the configured name.
    if not index:
        index = self._index
    # Only None means "not provided": an explicit empty dict must be honoured, otherwise callers
    # could never opt out of the constructor's mappings/settings.
    if mappings is None:
        mappings = self._mappings
    if settings is None:
        settings = self._settings

    # Creating an index that already exists is an error in OpenSearch, hence the existence check.
    if not self.client.indices.exists(index=index):
        self.client.indices.create(index=index, body={"mappings": mappings, "settings": settings})

def to_dict(self) -> Dict[str, Any]:
# This is not the best solution to serialise this class but is the fastest to implement.
# Not all kwargs types can be serialised to text so this can fail. We must serialise each
Expand All @@ -139,6 +168,8 @@ def to_dict(self) -> Dict[str, Any]:
method=self._method,
mappings=self._mappings,
settings=self._settings,
create_index=self._create_index,
return_embedding=self._return_embedding,
**self._kwargs,
)

Expand Down
2 changes: 2 additions & 0 deletions integrations/opensearch/tests/test_bm25_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def test_to_dict(_mock_opensearch_client):
"max_chunk_bytes": DEFAULT_MAX_CHUNK_BYTES,
"method": None,
"settings": {"index.knn": True},
"return_embedding": False,
"create_index": True,
},
"type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore",
},
Expand Down
46 changes: 45 additions & 1 deletion integrations/opensearch/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def test_to_dict(_mock_opensearch_client):
"max_chunk_bytes": DEFAULT_MAX_CHUNK_BYTES,
"method": None,
"settings": {"index.knn": True},
"return_embedding": False,
"create_index": True,
},
}

Expand All @@ -43,7 +45,14 @@ def test_to_dict(_mock_opensearch_client):
def test_from_dict(_mock_opensearch_client):
data = {
"type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore",
"init_parameters": {"hosts": "some hosts", "index": "default", "max_chunk_bytes": 1000, "embedding_dim": 1536},
"init_parameters": {
"hosts": "some hosts",
"index": "default",
"max_chunk_bytes": 1000,
"embedding_dim": 1536,
"create_index": False,
"return_embedding": True,
},
}
document_store = OpenSearchDocumentStore.from_dict(data)
assert document_store._hosts == "some hosts"
Expand All @@ -66,6 +75,8 @@ def test_from_dict(_mock_opensearch_client):
],
}
assert document_store._settings == {"index.knn": True}
assert document_store._return_embedding is True
assert document_store._create_index is False


@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
Expand Down Expand Up @@ -113,6 +124,30 @@ def document_store(self, request):
yield store
store.client.indices.delete(index=index, params={"ignore": [400, 404]})

@pytest.fixture
def document_store_readonly(self, request):
    """
    Provide a document store constructed with `create_index=False`.

    Before handing the store to the test, the cluster's transient
    `action.auto_create_index` setting is switched to False. After the test
    the setting is switched back to True and the test's index is deleted
    (400/404 responses from the delete are ignored).
    """
    # The index is named after the test node so every test gets its own index
    index_name = f"{request.node.name}"

    readonly_store = OpenSearchDocumentStore(
        hosts=["https://localhost:9200"],
        index=index_name,
        http_auth=("admin", "admin"),
        verify_certs=False,
        embedding_dim=768,
        method={"space_type": "cosinesimil", "engine": "nmslib", "name": "hnsw"},
        create_index=False,
    )

    # Setup: turn off automatic index creation on the cluster
    readonly_store.client.cluster.put_settings(body={"transient": {"action.auto_create_index": False}})

    yield readonly_store

    # Teardown: turn auto-creation back on, then drop whatever index the test left behind
    readonly_store.client.cluster.put_settings(body={"transient": {"action.auto_create_index": True}})
    readonly_store.client.indices.delete(index=index_name, params={"ignore": [400, 404]})

@pytest.fixture
def document_store_embedding_dim_4(self, request):
"""
Expand Down Expand Up @@ -165,6 +200,15 @@ def test_write_documents(self, document_store: OpenSearchDocumentStore):
with pytest.raises(DuplicateDocumentError):
document_store.write_documents(docs, DuplicatePolicy.FAIL)

def test_write_documents_readonly(self, document_store_readonly: OpenSearchDocumentStore):
    """
    Writing through the read-only fixture store must raise a `DocumentStoreError`
    whose message matches `index_not_found_exception`.
    """
    single_doc = Document(id="1")
    with pytest.raises(DocumentStoreError, match="index_not_found_exception"):
        document_store_readonly.write_documents([single_doc])

def test_create_index(self, document_store_readonly: OpenSearchDocumentStore):
    """
    Calling `create_index()` with no arguments must leave the store's
    configured index present in OpenSearch.
    """
    store = document_store_readonly
    store.create_index()
    index_exists = store.client.indices.exists(index=store._index)
    assert index_exists

def test_bm25_retrieval(self, document_store: OpenSearchDocumentStore):
document_store.write_documents(
[
Expand Down
2 changes: 2 additions & 0 deletions integrations/opensearch/tests/test_embedding_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ def test_to_dict(_mock_opensearch_client):
"settings": {
"index.knn": True,
},
"return_embedding": False,
"create_index": True,
},
"type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore",
},
Expand Down

0 comments on commit be09adf

Please sign in to comment.