Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chroma: Replace existing documents with newer versions #1058

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions vectorstores/chroma/chroma.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
chromago "github.com/amikos-tech/chroma-go"
"github.com/amikos-tech/chroma-go/openai"
chromatypes "github.com/amikos-tech/chroma-go/types"
"github.com/google/uuid"
"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/vectorstores"
Expand Down Expand Up @@ -106,7 +105,12 @@ func (s Store) AddDocuments(ctx context.Context,
texts := make([]string, len(docs))
metadatas := make([]map[string]any, len(docs))
for docIdx, doc := range docs {
ids[docIdx] = uuid.New().String() // TODO (noodnik2): find & use something more meaningful
if opts.Deduplicater != nil {
if opts.Deduplicater(ctx, doc) {
continue
}
}
ids[docIdx] = opts.GenerateDocumentID(ctx, doc, ids)
texts[docIdx] = doc.PageContent
mc := make(map[string]any, 0)
maps.Copy(mc, doc.Metadata)
Expand All @@ -115,9 +119,8 @@ func (s Store) AddDocuments(ctx context.Context,
metadatas[docIdx][s.nameSpaceKey] = nameSpace
}
}

col := s.collection
if _, addErr := col.Add(ctx, nil, metadatas, texts, ids); addErr != nil {
if _, addErr := col.Upsert(ctx, nil, metadatas, texts, ids); addErr != nil {
return nil, fmt.Errorf("%w: %w", ErrAddDocument, addErr)
}
return ids, nil
Expand Down
3 changes: 1 addition & 2 deletions vectorstores/opensearch/opensearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"fmt"
"io"

"github.com/google/uuid"
opensearchgo "github.com/opensearch-project/opensearch-go"
"github.com/opensearch-project/opensearch-go/opensearchapi"
"github.com/tmc/langchaingo/embeddings"
Expand Down Expand Up @@ -75,7 +74,7 @@ func (s Store) AddDocuments(
}

for i, doc := range docs {
id := uuid.NewString()
id := opts.GenerateDocumentID(ctx, doc, ids)
_, err := s.documentIndexing(ctx, id, opts.NameSpace, doc.PageContent, vectors[i], doc.Metadata)
if err != nil {
return ids, err
Expand Down
43 changes: 38 additions & 5 deletions vectorstores/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ package vectorstores

import (
"context"
"log/slog"
"slices"

"github.com/google/uuid"
"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
)
Expand All @@ -12,11 +15,12 @@ type Option func(*Options)

// Options is a set of options for similarity search and add documents.
type Options struct {
NameSpace string
ScoreThreshold float32
Filters any
Embedder embeddings.Embedder
Deduplicater func(context.Context, schema.Document) bool
NameSpace string
ScoreThreshold float32
Filters any
Embedder embeddings.Embedder
Deduplicater func(context.Context, schema.Document) bool
DocumentIDGenerater func(context.Context, schema.Document) string
}

// WithNameSpace returns an Option for setting the name space.
Expand Down Expand Up @@ -59,3 +63,32 @@ func WithDeduplicater(fn func(ctx context.Context, doc schema.Document) bool) Op
o.Deduplicater = fn
}
}

// WithIDGenerater returns an Option that stores fn in
// Options.DocumentIDGenerater, the callback consulted when a document ID is
// requested through GenerateDocumentID.
func WithIDGenerater(fn func(ctx context.Context, doc schema.Document) string) Option {
	setGenerator := func(opts *Options) {
		opts.DocumentIDGenerater = fn
	}
	return setGenerator
}

// generateDummyDoumentID returns a newly generated UUID in string form, used as
// the fallback document ID. The context argument is ignored.
//
// NOTE(review): the method name misspells "Document"; it is left as-is because
// GenerateDocumentID calls it under this name.
func (o Options) generateDummyDoumentID(_ context.Context) string {
	fallbackID := uuid.New().String()
	return fallbackID
}

// GenerateDocumentID returns the ID to use for doc.
//
// If no DocumentIDGenerater is configured, a new UUID is returned. Otherwise
// the generator's result is returned unless it is empty or already present in
// ids; in either of those cases a warning is logged via slog and a new UUID is
// returned instead.
func (o Options) GenerateDocumentID(ctx context.Context, doc schema.Document, ids []string) string {
	generate := o.DocumentIDGenerater
	if generate == nil {
		return o.generateDummyDoumentID(ctx)
	}

	candidate := generate(ctx, doc)
	switch {
	case candidate == "":
		slog.Warn("Document ID generator did not generate an id", "id", candidate)
	case slices.Contains(ids, candidate):
		slog.Warn("Document ID generator generated a non unique id", "id", candidate)
	default:
		// Non-empty and not yet used: accept the caller-supplied ID.
		return candidate
	}
	// Both rejected cases fall through to a freshly generated UUID.
	return o.generateDummyDoumentID(ctx)
}
2 changes: 1 addition & 1 deletion vectorstores/pgvector/pgvector.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ func (s Store) AddDocuments(

ids := make([]string, len(docs))
for docIdx, doc := range docs {
id := uuid.New().String()
id := opts.GenerateDocumentID(ctx, doc, ids)
ids[docIdx] = id
b.Queue(sql, id, doc.PageContent, pgvector.NewVector(vectors[docIdx]), doc.Metadata, s.collectionUUID)
}
Expand Down
3 changes: 1 addition & 2 deletions vectorstores/pinecone/pinecone.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"encoding/json"
"errors"

"github.com/google/uuid"
"github.com/pinecone-io/go-pinecone/pinecone"
"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
Expand Down Expand Up @@ -104,7 +103,7 @@ func (s Store) AddDocuments(ctx context.Context,
return nil, err
}

id := uuid.New().String()
id := opts.GenerateDocumentID(ctx, docs[i], ids)
ids[i] = id
pineconeVectors = append(
pineconeVectors,
Expand Down
3 changes: 1 addition & 2 deletions vectorstores/weaviate/weaviate.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"strings"

"github.com/go-openapi/strfmt"
"github.com/google/uuid"
"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/vectorstores"
Expand Down Expand Up @@ -133,7 +132,7 @@ func (s Store) AddDocuments(ctx context.Context,
objects := make([]*models.Object, 0, len(docs))
ids := make([]string, len(docs))
for i := range docs {
id := strfmt.UUID(uuid.New().String())
id := strfmt.UUID(opts.GenerateDocumentID(ctx, docs[i], ids))
ids[i] = id.String()
objects = append(objects, &models.Object{
Class: s.indexName,
Expand Down