Skip to content

Commit

Permalink
Adopt Secret to pgvector (#402)
Browse files Browse the repository at this point in the history
* initial import

* adding Secret support and fixing tests

* completing docs

* code formatting

* linting and typing

* fixing tests

* adding custom from_dict

* adding test coverage

* use deserialize_secrets_inplace()
  • Loading branch information
davidsbatista authored Feb 14, 2024
1 parent 3fad8ca commit 613e4ec
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 30 deletions.
13 changes: 12 additions & 1 deletion integrations/pgvector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,23 @@ pip install pgvector-haystack

## Testing

TODO
Ensure that you have a PostgreSQL instance running with the `pgvector` extension. For a quick setup using Docker, run:
```
docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=postgres ankane/pgvector
```

then run the tests:

```console
hatch run test
```

To run the coverage report:

```console
hatch run cov
```

## License

`pgvector-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
4 changes: 3 additions & 1 deletion integrations/pgvector/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# git clone https://github.com/anakin87/neural-search-pills

import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
Expand All @@ -20,9 +21,10 @@
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"

# Initialize PgvectorDocumentStore
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="haystack_test",
embedding_dimension=768,
vector_function="cosine_similarity",
Expand Down
13 changes: 7 additions & 6 deletions integrations/pgvector/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ ignore = [
"S105", "S106", "S107",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
# ignore function-call-in-default-argument
"B008",
]
unfixable = [
# Don't touch unused imports
Expand All @@ -156,23 +158,22 @@ ban-relative-imports = "parents"
# examples can contain "print" commands
"examples/**/*" = ["T201"]


[tool.coverage.run]
source_pkgs = ["src", "tests"]
source = ["haystack_integrations"]
branch = true
parallel = true


[tool.coverage.paths]
weaviate_haystack = ["src/haystack_integrations", "*/pgvector-haystack/src"]
tests = ["tests", "*/pgvector-haystack/tests"]

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]


[[tool.mypy.overrides]]
module = [
"haystack.*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,8 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
data["init_parameters"]["document_store"] = default_from_dict(
PgvectorDocumentStore, data["init_parameters"]["document_store"]
)
doc_store_params = data["init_parameters"]["document_store"]
data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params)
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import logging
from typing import Any, Dict, List, Literal, Optional

from haystack import default_to_dict
from haystack import default_from_dict, default_to_dict
from haystack.dataclasses.document import ByteStream, Document
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils.auth import Secret, deserialize_secrets_inplace
from haystack.utils.filters import convert
from psycopg import Error, IntegrityError, connect
from psycopg.abc import Query
Expand Down Expand Up @@ -69,7 +70,7 @@ class PgvectorDocumentStore:
def __init__(
self,
*,
connection_string: str,
connection_string: Secret = Secret.from_env_var("PG_CONN_STR"),
table_name: str = "haystack_documents",
embedding_dimension: int = 768,
vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity",
Expand All @@ -84,8 +85,8 @@ def __init__(
It is meant to be connected to a PostgreSQL database with the pgvector extension installed.
A specific table to store Haystack documents will be created if it doesn't exist yet.
:param connection_string: The connection string to use to connect to the PostgreSQL database.
e.g. "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
:param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an
environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
:param vector_function: The similarity function to use when searching for similar embeddings.
Expand Down Expand Up @@ -130,7 +131,7 @@ def __init__(
self.hnsw_index_creation_kwargs = hnsw_index_creation_kwargs or {}
self.hnsw_ef_search = hnsw_ef_search

connection = connect(connection_string)
connection = connect(self.connection_string.resolve_value())
connection.autocommit = True
self._connection = connection

Expand All @@ -151,7 +152,7 @@ def __init__(
def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
connection_string=self.connection_string,
connection_string=self.connection_string.to_dict(),
table_name=self.table_name,
embedding_dimension=self.embedding_dimension,
vector_function=self.vector_function,
Expand All @@ -162,6 +163,11 @@ def to_dict(self) -> Dict[str, Any]:
hnsw_ef_search=self.hnsw_ef_search,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore":
    """
    Build a PgvectorDocumentStore from its serialized dictionary form.

    The ``connection_string`` entry of ``data["init_parameters"]`` is deserialized in place
    (so ``data`` is modified) before the dictionary is handed to ``default_from_dict``.

    :param data: Dictionary holding an ``init_parameters`` mapping, as produced by ``to_dict``.
    :return: The document store created by ``default_from_dict``.
    """
    init_params = data["init_parameters"]
    secret_keys = ["connection_string"]
    deserialize_secrets_inplace(init_params, secret_keys)
    return default_from_dict(cls, data)

def _execute_sql(
self, sql_query: Query, params: Optional[tuple] = None, error_msg: str = "", cursor: Optional[Cursor] = None
):
Expand Down Expand Up @@ -221,15 +227,15 @@ def _handle_hnsw(self):
)
self._execute_sql(sql_set_hnsw_ef_search, error_msg="Could not set hnsw.ef_search")

index_esists = bool(
index_exists = bool(
self._execute_sql(
"SELECT 1 FROM pg_indexes WHERE tablename = %s AND indexname = %s",
(self.table_name, HNSW_INDEX_NAME),
"Could not check if HNSW index exists",
).fetchone()
)

if index_esists and not self.hnsw_recreate_index_if_exists:
if index_exists and not self.hnsw_recreate_index_if_exists:
logger.warning(
"HNSW index already exists and won't be recreated. "
"If you want to recreate it, pass 'hnsw_recreate_index_if_exists=True' to the "
Expand Down Expand Up @@ -373,7 +379,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D

return written_docs

def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict[str, Any]]:
@staticmethod
def _from_haystack_to_pg_documents(documents: List[Document]) -> List[Dict[str, Any]]:
"""
Internal method to convert a list of Haystack Documents to a list of dictionaries that can be used to insert
documents into the PgvectorDocumentStore.
Expand All @@ -395,7 +402,8 @@ def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict

return db_documents

def _from_pg_to_haystack_documents(self, documents: List[Dict[str, Any]]) -> List[Document]:
@staticmethod
def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Document]:
"""
Internal method to convert a list of dictionaries from pgvector to a list of Haystack Documents.
"""
Expand Down
6 changes: 4 additions & 2 deletions integrations/pgvector/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
import os

import pytest
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore


@pytest.fixture
def document_store(request):
connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
table_name = f"haystack_{request.node.name}"
embedding_dimension = 768
vector_function = "cosine_similarity"
recreate_table = True
search_strategy = "exact_nearest_neighbor"

store = PgvectorDocumentStore(
connection_string=connection_string,
table_name=table_name,
embedding_dimension=embedding_dimension,
vector_function=vector_function,
recreate_table=recreate_table,
search_strategy=search_strategy,
)

yield store

store.delete_table()
5 changes: 1 addition & 4 deletions integrations/pgvector/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def test_write_dataframe(self, document_store: PgvectorDocumentStore):

def test_init(self):
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="my_table",
embedding_dimension=512,
vector_function="l2_distance",
Expand All @@ -52,7 +51,6 @@ def test_init(self):
hnsw_ef_search=50,
)

assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres"
assert document_store.table_name == "my_table"
assert document_store.embedding_dimension == 512
assert document_store.vector_function == "l2_distance"
Expand All @@ -64,7 +62,6 @@ def test_init(self):

def test_to_dict(self):
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="my_table",
embedding_dimension=512,
vector_function="l2_distance",
Expand All @@ -78,7 +75,7 @@ def test_to_dict(self):
assert document_store.to_dict() == {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "my_table",
"embedding_dimension": 512,
"vector_function": "l2_distance",
Expand Down
7 changes: 4 additions & 3 deletions integrations/pgvector/tests/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unittest.mock import Mock

from haystack.dataclasses import Document
from haystack.utils.auth import EnvVarSecret
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

Expand Down Expand Up @@ -37,7 +38,7 @@ def test_to_dict(self, document_store: PgvectorDocumentStore):
"document_store": {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "haystack_test_to_dict",
"embedding_dimension": 768,
"vector_function": "cosine_similarity",
Expand All @@ -62,7 +63,7 @@ def test_from_dict(self):
"document_store": {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "haystack_test_to_dict",
"embedding_dimension": 768,
"vector_function": "cosine_similarity",
Expand All @@ -83,7 +84,7 @@ def test_from_dict(self):
document_store = retriever.document_store

assert isinstance(document_store, PgvectorDocumentStore)
assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres"
assert isinstance(document_store.connection_string, EnvVarSecret)
assert document_store.table_name == "haystack_test_to_dict"
assert document_store.embedding_dimension == 768
assert document_store.vector_function == "cosine_similarity"
Expand Down

0 comments on commit 613e4ec

Please sign in to comment.