community: BM25Retriever preservation of document id (#27019)

Currently this retriever discards document ids; this change preserves them.

---------

Co-authored-by: asi-cider <[email protected]>
Co-authored-by: Erick Friis <[email protected]>
langchain-ai · Dec 4, 2024 · d34bf78 · d34bf78
1 parent a009249
commit d34bf78
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 2 deletions.
diff --git a/libs/community/langchain_community/retrievers/bm25.py b/libs/community/langchain_community/retrievers/bm25.py
@@ -33,6 +33,7 @@ def from_texts(
         cls,
         texts: Iterable[str],
         metadatas: Optional[Iterable[dict]] = None,
+        ids: Optional[Iterable[str]] = None,
         bm25_params: Optional[Dict[str, Any]] = None,
         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
         **kwargs: Any,
@@ -42,6 +43,7 @@ def from_texts(
         Args:
             texts: A list of texts to vectorize.
             metadatas: A list of metadata dicts to associate with each text.
+            ids: A list of ids to associate with each text.
             bm25_params: Parameters to pass to the BM25 vectorizer.
             preprocess_func: A function to preprocess each text before vectorization.
             **kwargs: Any other arguments to pass to the retriever.
@@ -61,7 +63,15 @@ def from_texts(
         bm25_params = bm25_params or {}
         vectorizer = BM25Okapi(texts_processed, **bm25_params)
         metadatas = metadatas or ({} for _ in texts)
-        docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
+        if ids:
+            docs = [
+                Document(page_content=t, metadata=m, id=i)
+                for t, m, i in zip(texts, metadatas, ids)
+            ]
+        else:
+            docs = [
+                Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)
+            ]
         return cls(
             vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
         )
@@ -86,11 +96,14 @@ def from_documents(
         Returns:
             A BM25Retriever instance.
         """
-        texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
+        texts, metadatas, ids = zip(
+            *((d.page_content, d.metadata, d.id) for d in documents)
+        )
         return cls.from_texts(
             texts=texts,
             bm25_params=bm25_params,
             metadatas=metadatas,
+            ids=ids,
             preprocess_func=preprocess_func,
             **kwargs,
         )

diff --git a/libs/community/tests/unit_tests/retrievers/test_bm25.py b/libs/community/tests/unit_tests/retrievers/test_bm25.py
@@ -43,3 +43,42 @@ def test_repr() -> None:
     ]
     bm25_retriever = BM25Retriever.from_documents(documents=input_docs)
     assert "I have a pen" not in repr(bm25_retriever)
+
+
+@pytest.mark.requires("rank_bm25")
+def test_doc_id() -> None:
+    docs_with_ids = [
+        Document(page_content="I have a pen.", id="1"),
+        Document(page_content="Do you have a pen?", id="2"),
+        Document(page_content="I have a bag.", id="3"),
+    ]
+    docs_without_ids = [
+        Document(page_content="I have a pen."),
+        Document(page_content="Do you have a pen?"),
+        Document(page_content="I have a bag."),
+    ]
+    docs_with_some_ids = [
+        Document(page_content="I have a pen.", id="1"),
+        Document(page_content="Do you have a pen?"),
+        Document(page_content="I have a bag.", id="3"),
+    ]
+    bm25_retriever_with_ids = BM25Retriever.from_documents(documents=docs_with_ids)
+    bm25_retriever_without_ids = BM25Retriever.from_documents(
+        documents=docs_without_ids
+    )
+    bm25_retriever_with_some_ids = BM25Retriever.from_documents(
+        documents=docs_with_some_ids
+    )
+    for doc in bm25_retriever_with_ids.docs:
+        assert doc.id is not None
+    for doc in bm25_retriever_without_ids.docs:
+        assert doc.id is None
+    for doc in bm25_retriever_with_some_ids.docs:
+        if doc.page_content == "I have a pen.":
+            assert doc.id == "1"
+        elif doc.page_content == "Do you have a pen?":
+            assert doc.id is None
+        elif doc.page_content == "I have a bag.":
+            assert doc.id == "3"
+        else:
+            raise ValueError("Unexpected document")