diff --git a/libs/community/langchain_community/retrievers/bm25.py b/libs/community/langchain_community/retrievers/bm25.py index 543058c131a98..70910ce170f61 100644 --- a/libs/community/langchain_community/retrievers/bm25.py +++ b/libs/community/langchain_community/retrievers/bm25.py @@ -33,6 +33,7 @@ def from_texts( cls, texts: Iterable[str], metadatas: Optional[Iterable[dict]] = None, + ids: Optional[Iterable[str]] = None, bm25_params: Optional[Dict[str, Any]] = None, preprocess_func: Callable[[str], List[str]] = default_preprocessing_func, **kwargs: Any, @@ -42,6 +43,7 @@ def from_texts( Args: texts: A list of texts to vectorize. metadatas: A list of metadata dicts to associate with each text. + ids: A list of ids to associate with each text. bm25_params: Parameters to pass to the BM25 vectorizer. preprocess_func: A function to preprocess each text before vectorization. **kwargs: Any other arguments to pass to the retriever. @@ -61,7 +63,15 @@ def from_texts( bm25_params = bm25_params or {} vectorizer = BM25Okapi(texts_processed, **bm25_params) metadatas = metadatas or ({} for _ in texts) - docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)] + if ids: + docs = [ + Document(page_content=t, metadata=m, id=i) + for t, m, i in zip(texts, metadatas, ids) + ] + else: + docs = [ + Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas) + ] return cls( vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs ) @@ -86,11 +96,14 @@ def from_documents( Returns: A BM25Retriever instance. """ - texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents)) + texts, metadatas, ids = zip( + *((d.page_content, d.metadata, d.id) for d in documents) + ) return cls.from_texts( texts=texts, bm25_params=bm25_params, metadatas=metadatas, + ids=ids, preprocess_func=preprocess_func, **kwargs, ) diff --git a/libs/community/tests/unit_tests/retrievers/test_bm25.py b/libs/community/tests/unit_tests/retrievers/test_bm25.py index ef40b25ba7dee..81afae84c72c7 100644 --- a/libs/community/tests/unit_tests/retrievers/test_bm25.py +++ b/libs/community/tests/unit_tests/retrievers/test_bm25.py @@ -43,3 +43,42 @@ def test_repr() -> None: ] bm25_retriever = BM25Retriever.from_documents(documents=input_docs) assert "I have a pen" not in repr(bm25_retriever) + + +@pytest.mark.requires("rank_bm25") +def test_doc_id() -> None: + docs_with_ids = [ + Document(page_content="I have a pen.", id="1"), + Document(page_content="Do you have a pen?", id="2"), + Document(page_content="I have a bag.", id="3"), + ] + docs_without_ids = [ + Document(page_content="I have a pen."), + Document(page_content="Do you have a pen?"), + Document(page_content="I have a bag."), + ] + docs_with_some_ids = [ + Document(page_content="I have a pen.", id="1"), + Document(page_content="Do you have a pen?"), + Document(page_content="I have a bag.", id="3"), + ] + bm25_retriever_with_ids = BM25Retriever.from_documents(documents=docs_with_ids) + bm25_retriever_without_ids = BM25Retriever.from_documents( + documents=docs_without_ids + ) + bm25_retriever_with_some_ids = BM25Retriever.from_documents( + documents=docs_with_some_ids + ) + for doc in bm25_retriever_with_ids.docs: + assert doc.id is not None + for doc in bm25_retriever_without_ids.docs: + assert doc.id is None + for doc in bm25_retriever_with_some_ids.docs: + if doc.page_content == "I have a pen.": + assert doc.id == "1" + elif doc.page_content == "Do you have a pen?": + assert doc.id is None + elif doc.page_content == "I have a bag.": + assert doc.id == "3" + else: + raise ValueError("Unexpected document")