Skip to content

Commit

Permalink
community: BM25Retriever preservation of document id (#27019)
Browse files Browse the repository at this point in the history
Currently, this retriever discards document ids.

---------

Co-authored-by: asi-cider <[email protected]>
Co-authored-by: Erick Friis <[email protected]>
  • Loading branch information
3 people authored Dec 4, 2024
1 parent a009249 commit d34bf78
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 2 deletions.
17 changes: 15 additions & 2 deletions libs/community/langchain_community/retrievers/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def from_texts(
cls,
texts: Iterable[str],
metadatas: Optional[Iterable[dict]] = None,
ids: Optional[Iterable[str]] = None,
bm25_params: Optional[Dict[str, Any]] = None,
preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
**kwargs: Any,
Expand All @@ -42,6 +43,7 @@ def from_texts(
Args:
texts: A list of texts to vectorize.
metadatas: A list of metadata dicts to associate with each text.
ids: A list of ids to associate with each text.
bm25_params: Parameters to pass to the BM25 vectorizer.
preprocess_func: A function to preprocess each text before vectorization.
**kwargs: Any other arguments to pass to the retriever.
Expand All @@ -61,7 +63,15 @@ def from_texts(
bm25_params = bm25_params or {}
vectorizer = BM25Okapi(texts_processed, **bm25_params)
metadatas = metadatas or ({} for _ in texts)
docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
if ids:
docs = [
Document(page_content=t, metadata=m, id=i)
for t, m, i in zip(texts, metadatas, ids)
]
else:
docs = [
Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)
]
return cls(
vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
)
Expand All @@ -86,11 +96,14 @@ def from_documents(
Returns:
A BM25Retriever instance.
"""
texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
texts, metadatas, ids = zip(
*((d.page_content, d.metadata, d.id) for d in documents)
)
return cls.from_texts(
texts=texts,
bm25_params=bm25_params,
metadatas=metadatas,
ids=ids,
preprocess_func=preprocess_func,
**kwargs,
)
Expand Down
39 changes: 39 additions & 0 deletions libs/community/tests/unit_tests/retrievers/test_bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,42 @@ def test_repr() -> None:
]
bm25_retriever = BM25Retriever.from_documents(documents=input_docs)
assert "I have a pen" not in repr(bm25_retriever)


@pytest.mark.requires("rank_bm25")
def test_doc_id() -> None:
    """Check that ``BM25Retriever.from_documents`` keeps each document's id.

    Three inputs are exercised: every document has an id, no document has an
    id, and only some documents have an id. For each input, every document
    held by the retriever must carry exactly the id of the input document
    with the same ``page_content`` (``None`` when the input had no id).
    """
    docs_with_ids = [
        Document(page_content="I have a pen.", id="1"),
        Document(page_content="Do you have a pen?", id="2"),
        Document(page_content="I have a bag.", id="3"),
    ]
    docs_without_ids = [
        Document(page_content="I have a pen."),
        Document(page_content="Do you have a pen?"),
        Document(page_content="I have a bag."),
    ]
    docs_with_some_ids = [
        Document(page_content="I have a pen.", id="1"),
        Document(page_content="Do you have a pen?"),
        Document(page_content="I have a bag.", id="3"),
    ]

    for input_docs in (docs_with_ids, docs_without_ids, docs_with_some_ids):
        # Key the expectation on page_content so the check does not depend on
        # the order in which the retriever stores its documents.
        expected_ids = {doc.page_content: doc.id for doc in input_docs}
        retriever = BM25Retriever.from_documents(documents=input_docs)

        # Guard against the per-document checks below passing vacuously.
        assert len(retriever.docs) == len(input_docs)

        for doc in retriever.docs:
            if doc.page_content not in expected_ids:
                raise ValueError("Unexpected document")
            # Exact comparison: an id attached to the wrong document fails
            # here, which a plain "is not None" check would not detect.
            assert doc.id == expected_ids[doc.page_content]

0 comments on commit d34bf78

Please sign in to comment.