Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: add samples to migrate Pinecone to AlloyDB #292

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
165 changes: 165 additions & 0 deletions samples/migrations/snippets/alloydb_snippets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import sys
import uuid
from typing import Any, Optional

# [START langchain_alloydb_get_client]
from langchain_google_alloydb_pg import AlloyDBEngine


async def aget_client(
    project_id: str,
    region: str,
    cluster: str,
    instance: str,
    database: str,
    user: Optional[str] = None,
    password: Optional[str] = None,
) -> AlloyDBEngine:
    """Create an AlloyDB engine for one instance and database.

    Every argument is forwarded unchanged to ``AlloyDBEngine.afrom_instance``.

    Args:
        project_id: Google Cloud project that owns the cluster.
        region: Region of the AlloyDB cluster.
        cluster: AlloyDB cluster name.
        instance: AlloyDB instance name.
        database: Database to connect to.
        user: Optional database user; ``None`` is passed through as-is.
        password: Optional database password; ``None`` is passed through
            as-is.

    Returns:
        The engine produced by ``AlloyDBEngine.afrom_instance``.
    """
    connection_settings = {
        "project_id": project_id,
        "region": region,
        "cluster": cluster,
        "instance": instance,
        "database": database,
        "user": user,
        "password": password,
    }
    alloydb_engine = await AlloyDBEngine.afrom_instance(**connection_settings)

    print("Langchain AlloyDB client initiated.")
    return alloydb_engine


# [END langchain_alloydb_get_client]

# [START langchain_alloydb_fake_embedding_service]
from langchain_core.embeddings import FakeEmbeddings


def get_embeddings_service(size: int) -> FakeEmbeddings:
    """Build the placeholder embeddings service used by this sample.

    Args:
        size: Value passed to ``FakeEmbeddings`` as its ``size``.

    Returns:
        The newly constructed ``FakeEmbeddings`` instance.
    """
    fake_embeddings = FakeEmbeddings(size=size)
    print("Langchain FakeEmbeddings service initiated.")
    return fake_embeddings


# [END langchain_alloydb_fake_embedding_service]


# [START langchain_create_alloydb_vector_store_table]
async def ainit_vector_store(
    engine: AlloyDBEngine, table_name: str, vector_size: int, **kwargs: Any
) -> None:
    """Create the table that backs the vector store.

    Args:
        engine: AlloyDB engine used to create the table.
        table_name: Name of the table to initialize.
        vector_size: Dimension of the stored embedding vectors.
        **kwargs: Extra options forwarded to
            ``engine.ainit_vectorstore_table``. ``overwrite_existing``
            defaults to ``True`` but may be overridden here.
    """
    # Merge the default with the caller's options so that passing
    # ``overwrite_existing`` explicitly overrides the default instead of
    # raising a duplicate-keyword TypeError.
    table_options = {"overwrite_existing": True, **kwargs}
    await engine.ainit_vectorstore_table(
        table_name=table_name,
        vector_size=vector_size,
        **table_options,
    )

    print("Langchain AlloyDB vector store table initialized.")


# [END langchain_create_alloydb_vector_store_table]


# [START langchain_get_alloydb_vector_store]
from langchain_core.embeddings import Embeddings

from langchain_google_alloydb_pg import AlloyDBVectorStore


async def aget_vector_store(
    engine: AlloyDBEngine, embeddings_service: Embeddings, table_name: str
) -> AlloyDBVectorStore:
    """Open a vector store over an existing table.

    Args:
        engine: AlloyDB engine the store should use.
        embeddings_service: Embeddings implementation handed to the store.
        table_name: Table the store reads from and writes to.

    Returns:
        The store returned by ``AlloyDBVectorStore.create``.
    """
    store_settings = {
        "engine": engine,
        "embedding_service": embeddings_service,
        "table_name": table_name,
    }
    store = await AlloyDBVectorStore.create(**store_settings)

    print("Langchain AlloyDB vector store instantiated.")
    return store


# [END langchain_get_alloydb_vector_store]


# [START langchain_alloydb_vector_store_insert_data]
async def ainsert_data(
    vector_store: AlloyDBVectorStore,
    texts: list[str],
    embeddings: list[list[float]],
    metadatas: list[dict],
    ids: list[str],
) -> list[str]:
    """Insert pre-computed embeddings into the vector store.

    Args:
        vector_store: Destination store.
        texts: Document contents, one per row.
        embeddings: Embedding vector for each entry in ``texts``.
        metadatas: Metadata dict for each entry in ``texts``.
        ids: Row id for each entry in ``texts``.

    Returns:
        The ids returned by ``vector_store.aadd_embeddings``.
    """
    inserted_ids = await vector_store.aadd_embeddings(
        texts=texts,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
    )

    # The previous message was copied from the Pinecone fetch helper and
    # described a read; this function writes.
    print("AlloyDB client inserted all data into the vector store.")
    return inserted_ids


# [END langchain_alloydb_vector_store_insert_data]


async def main() -> None:
    """Run the AlloyDB sample end to end from command-line arguments.

    Expected arguments, in order: project id, region, cluster, instance,
    database, user, password, table name.

    Raises:
        SystemExit: If fewer than eight arguments are supplied.
    """
    if len(sys.argv) < 9:
        raise SystemExit(
            f"usage: {sys.argv[0]} PROJECT_ID REGION CLUSTER INSTANCE "
            "DATABASE USER PASSWORD TABLE_NAME"
        )
    table_name = sys.argv[8]
    # The embeddings service and the table must agree on the dimension.
    vector_size = 768

    client = await aget_client(
        project_id=sys.argv[1],
        region=sys.argv[2],
        cluster=sys.argv[3],
        instance=sys.argv[4],
        database=sys.argv[5],
        user=sys.argv[6],
        password=sys.argv[7],
    )
    try:
        # In case you're using a different embeddings service, choose one from
        # LangChain's Embedding models:
        # https://python.langchain.com/v0.2/docs/integrations/text_embedding/
        embeddings_service = get_embeddings_service(size=vector_size)
        await ainit_vector_store(
            engine=client,
            table_name=table_name,
            vector_size=vector_size,
        )
        vs = await aget_vector_store(
            engine=client,
            embeddings_service=embeddings_service,
            table_name=table_name,
        )
        # sample rows
        ids = [str(uuid.uuid4())]
        contents = ["content_1"]
        embeddings = embeddings_service.embed_documents(contents)
        metadatas = [{} for _ in contents]
        # ainsert_data names this parameter ``texts``; passing ``contents=``
        # raised a TypeError.
        inserted_ids = await ainsert_data(
            vector_store=vs,
            texts=contents,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
    finally:
        # Close the engine even when one of the steps above fails.
        await client.close()
    print(f"Inserted {len(inserted_ids)} values to Langchain Alloy DB Vector Store.")


if __name__ == "__main__":
    asyncio.run(main())
101 changes: 101 additions & 0 deletions samples/migrations/snippets/pinecone_snippets.py
vishwarajanand marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

# [START pinecone_get_client]
from pinecone import Index, Pinecone, ServerlessSpec


def get_client(pinecone_api_key: str) -> Pinecone:
    """Create a Pinecone client authenticated with the given API key.

    Args:
        pinecone_api_key: API key passed to the ``Pinecone`` constructor.

    Returns:
        The constructed ``Pinecone`` client.
    """
    # NOTE(review): a serverless spec is normally supplied when creating an
    # index; confirm that the Pinecone constructor actually uses ``spec``.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pinecone_client = Pinecone(api_key=pinecone_api_key, spec=serverless_spec)

    print("Pinecone client initiated.")
    return pinecone_client


# [END pinecone_get_client]

# [START pinecone_get_index]
from pinecone import Pinecone, ServerlessSpec


def get_index(client: Pinecone, index_name: str = "index-name") -> Index:
    """Return a handle to the named Pinecone index.

    Args:
        client: Pinecone client used to look up the index.
        index_name: Name handed to ``client.Index``.

    Returns:
        Whatever ``client.Index(index_name)`` returns.
    """
    index_handle = client.Index(index_name)
    print("Pinecone index reference initiated.")
    return index_handle


# [END pinecone_get_index]


# [START pinecone_get_all_ids]
def get_all_ids(index: Index, namespace: str = "") -> list[str]:
    """Collect every vector id in one namespace of a Pinecone index.

    Args:
        index: Index to list ids from.
        namespace: Namespace to list; the empty string is the default.

    Returns:
        All ids, in the order the pages return them.
    """
    page = index.list_paginated(prefix="", namespace=namespace)
    ids = [vector.id for vector in page.vectors]
    while page.pagination is not None:
        # Every follow-up page must repeat the namespace; without it the
        # request no longer targets the namespace the first page listed.
        page = index.list_paginated(
            prefix="",
            namespace=namespace,
            pagination_token=page.pagination.next,
        )
        ids.extend(vector.id for vector in page.vectors)

    print("Pinecone client fetched all ids from index.")

    return ids


# [END pinecone_get_all_ids]


# [START pinecone_get_all_data]
def get_all_data(
    index: Index, ids: list[str], batch_size: int = 100
) -> tuple[list[str], list[str], list[list[float]], list[dict]]:
    """Fetch ids, contents, embeddings and metadata for the given vectors.

    Args:
        index: Index to fetch from.
        ids: Ids of the vectors to download.
        batch_size: Maximum number of ids sent in one ``index.fetch`` call.

    Returns:
        Four parallel lists: ids, contents, embeddings, metadatas. Each
        content entry is the string form of that vector's metadata.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    fetched_ids: list[str] = []
    contents: list[str] = []
    embeddings: list[list[float]] = []
    metadatas: list[dict] = []
    # Fetch in slices so that no single request carries the whole id list;
    # an empty list makes no request at all.
    for start in range(0, len(ids), batch_size):
        batch = index.fetch(ids=ids[start : start + batch_size])
        for doc in batch["vectors"].values():
            metadata = doc["metadata"]
            fetched_ids.append(doc["id"])
            embeddings.append(doc["values"])
            contents.append(str(metadata))
            metadatas.append(metadata)

    print("Pinecone client fetched all data from index.")
    return fetched_ids, contents, embeddings, metadatas


# [END pinecone_get_all_data]


if __name__ == "__main__":
    # Arguments: sys.argv[1] is the Pinecone API key, sys.argv[2] the index
    # name.
    client = get_client(pinecone_api_key=sys.argv[1])
    index = get_index(client=client, index_name=sys.argv[2])
    # List every id first, then download the records for those ids.
    ids, content, embeddings, metadatas = get_all_data(
        index=index,
        ids=get_all_ids(index=index),
    )
    print(f"Downloaded {len(ids)} values from Pinecone.")
2 changes: 2 additions & 0 deletions samples/migrations/snippets/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pytest==8.3.3
pytest-asyncio==0.24.0
11 changes: 11 additions & 0 deletions samples/migrations/snippets/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
langchain-google-alloydb-pg==0.8.0
langchain-core==0.3.25
# Pinecone has a grpc option
# pinecone[grpc]==5.0.1
pinecone==5.4.2
weaviate-client==4.10.2
langchain-chroma==0.1.4
qdrant-client==1.12.1
pymilvus==2.5.0
protobuf==5.29.1
grpcio-tools==1.67.1
Loading
Loading