Rag chunking embedding #33364

Merged
merged 11 commits on Dec 17, 2024
25 changes: 25 additions & 0 deletions sdks/python/apache_beam/ml/rag/__init__.py
@@ -0,0 +1,25 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Apache Beam RAG (Retrieval Augmented Generation) components.
This package provides components for building RAG pipelines in Apache Beam,
including:
- Chunking
- Embedding generation
- Vector storage
- Vector search enrichment
"""
21 changes: 21 additions & 0 deletions sdks/python/apache_beam/ml/rag/chunking/__init__.py
@@ -0,0 +1,21 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Chunking components for RAG pipelines.
This module provides components for splitting text into chunks for RAG
pipelines.
"""
94 changes: 94 additions & 0 deletions sdks/python/apache_beam/ml/rag/chunking/base.py
@@ -0,0 +1,94 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import abc
import functools
from collections.abc import Callable
from typing import Any
from typing import Dict
from typing import Optional

import apache_beam as beam
from apache_beam.ml.rag.types import Chunk
from apache_beam.ml.transforms.base import MLTransformProvider

ChunkIdFn = Callable[[Chunk], str]


def _assign_chunk_id(chunk_id_fn: ChunkIdFn, chunk: Chunk):
chunk.id = chunk_id_fn(chunk)
return chunk


class ChunkingTransformProvider(MLTransformProvider):
def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
"""Base class for chunking transforms in RAG pipelines.

ChunkingTransformProvider defines the interface for splitting documents
into chunks for embedding and retrieval. Implementations should define how
to split content while preserving metadata and managing chunk IDs.

The transform flow:
1. Takes input documents with content and metadata
2. Splits content into chunks using implementation-specific logic
3. Preserves document metadata in resulting chunks
4. Optionally assigns unique IDs to chunks (configurable via chunk_id_fn)

Example usage:
```python
class MyChunker(ChunkingTransformProvider):
def get_splitter_transform(self):
return beam.ParDo(MySplitterDoFn())

chunker = MyChunker(chunk_id_fn=my_id_function)

with beam.Pipeline() as p:
chunks = (
p
| beam.Create([{'text': 'document...', 'source': 'doc.txt'}])
| MLTransform(...).with_transform(chunker))
```

Args:
chunk_id_fn: Optional function that takes a Chunk and returns a str, used
to generate chunk IDs. If not provided, random UUIDs will be used.
"""
self.assign_chunk_id_fn = functools.partial(
_assign_chunk_id, chunk_id_fn) if chunk_id_fn is not None else None

@abc.abstractmethod
def get_splitter_transform(
self
) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
beam.PCollection[Chunk]]:
"""Creates transforms that emits splits for given content."""
raise NotImplementedError(
"Subclasses must implement get_splitter_transform")

def get_ptransform_for_processing(
self, **kwargs
) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
beam.PCollection[Chunk]]:
"""Creates transform for processing documents into chunks."""
ptransform = (
"Split document" >>
self.get_splitter_transform().with_output_types(Chunk))
if self.assign_chunk_id_fn:
ptransform = (
ptransform | "Assign chunk id" >> beam.Map(
self.assign_chunk_id_fn).with_output_types(Chunk))
return ptransform
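
The `chunk_id_fn` hook accepts any `Callable[[Chunk], str]`. As a point of reference, below is a minimal sketch of a deterministic ID function built only from the `Chunk` fields exercised elsewhere in this PR (`metadata`, `index`, `content.text`); the hashing scheme itself is illustrative, not part of the change:

```python
import hashlib

from apache_beam.ml.rag.types import Chunk


def deterministic_chunk_id(chunk: Chunk) -> str:
  """Derives a stable ID from the chunk's source, index, and text."""
  key = (
      f"{chunk.metadata.get('source', '')}:"
      f"{chunk.index}:{chunk.content.text}")
  return hashlib.sha256(key.encode('utf-8')).hexdigest()


# Passed at construction time, as in the class docstring example:
# chunker = MyChunker(chunk_id_fn=deterministic_chunk_id)
```

A deterministic ID makes re-runs idempotent when chunks are later written to a vector store, whereas the default random UUIDs produce new IDs on every run.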
129 changes: 129 additions & 0 deletions sdks/python/apache_beam/ml/rag/chunking/base_test.py
@@ -0,0 +1,129 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for apache_beam.ml.rag.chunking.base."""

import unittest
from typing import Any
from typing import Dict
from typing import Optional

import pytest

import apache_beam as beam
from apache_beam.ml.rag.chunking.base import ChunkIdFn
from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider
from apache_beam.ml.rag.types import Chunk
from apache_beam.ml.rag.types import Content
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class WordSplitter(beam.DoFn):
def process(self, element):
words = element['text'].split()
for i, word in enumerate(words):
yield Chunk(
content=Content(text=word),
index=i,
metadata={'source': element['source']})


class MockChunkingProvider(ChunkingTransformProvider):
def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
super().__init__(chunk_id_fn=chunk_id_fn)

def get_splitter_transform(
self
) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
beam.PCollection[Chunk]]:
return beam.ParDo(WordSplitter())


def chunk_equals(expected, actual):
"""Custom equality function for Chunk objects."""
if not isinstance(expected, Chunk) or not isinstance(actual, Chunk):
return False
# Don't compare IDs since they're randomly generated
return (
expected.index == actual.index and expected.content == actual.content and
expected.metadata == actual.metadata)


def id_equals(expected, actual):
"""Custom equality function for Chunk object id's."""
if not isinstance(expected, Chunk) or not isinstance(actual, Chunk):
return False
return (expected.id == actual.id)


@pytest.mark.uses_transformers
class ChunkingTransformProviderTest(unittest.TestCase):
def setUp(self):
self.test_doc = {'text': 'hello world test', 'source': 'test.txt'}

def test_chunking_transform(self):
"""Test the complete chunking transform."""
provider = MockChunkingProvider()

with TestPipeline() as p:
chunks = (
p
| beam.Create([self.test_doc])
| provider.get_ptransform_for_processing())

expected = [
Chunk(
content=Content(text="hello"),
index=0,
metadata={'source': 'test.txt'}),
Chunk(
content=Content(text="world"),
index=1,
metadata={'source': 'test.txt'}),
Chunk(
content=Content(text="test"),
index=2,
metadata={'source': 'test.txt'})
]

assert_that(chunks, equal_to(expected, equals_fn=chunk_equals))

def test_custom_chunk_id_fn(self):
"""Test the a custom chink id function."""
def source_index_id_fn(chunk: Chunk):
return f"{chunk.metadata['source']}_{chunk.index}"

provider = MockChunkingProvider(chunk_id_fn=source_index_id_fn)

with TestPipeline() as p:
chunks = (
p
| beam.Create([self.test_doc])
| provider.get_ptransform_for_processing())

expected = [
Chunk(content=Content(text="hello"), id="test.txt_0"),
Chunk(content=Content(text="world"), id="test.txt_1"),
Chunk(content=Content(text="test"), id="test.txt_2")
]

assert_that(chunks, equal_to(expected, equals_fn=id_equals))


if __name__ == '__main__':
unittest.main()
120 changes: 120 additions & 0 deletions sdks/python/apache_beam/ml/rag/chunking/langchain.py
@@ -0,0 +1,120 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any
from typing import Dict
from typing import List
from typing import Optional

import apache_beam as beam
from apache_beam.ml.rag.chunking.base import ChunkIdFn
from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider
from apache_beam.ml.rag.types import Chunk
from apache_beam.ml.rag.types import Content

try:
from langchain.text_splitter import TextSplitter
except ImportError:
TextSplitter = None # type: ignore


class LangChainChunker(ChunkingTransformProvider):
def __init__(
self,
text_splitter: TextSplitter,
document_field: str,
metadata_fields: List[str],
chunk_id_fn: Optional[ChunkIdFn] = None):
"""A ChunkingTransformProvider that uses LangChain text splitters.

This provider integrates LangChain's text splitting capabilities into
Beam's MLTransform framework. It supports various text splitting strategies
through LangChain's TextSplitter interface, such as recursive character
splitting.

The provider:
- Takes documents with text content and metadata
- Splits text using configured LangChain splitter
- Preserves document metadata in resulting chunks
- Assigns unique IDs to chunks (configurable via chunk_id_fn)

Example usage:
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
chunk_size=100,
chunk_overlap=20
)

chunker = LangChainChunker(
text_splitter=splitter,
document_field='text',
metadata_fields=['source'])

with beam.Pipeline() as p:
chunks = (
p
| beam.Create([{'text': 'long document...', 'source': 'doc.txt'}])
| MLTransform(...).with_transform(chunker))
```

Args:
text_splitter: A LangChain TextSplitter instance that defines how
documents are split into chunks.
document_field: The field in each input document containing the text
to split.
metadata_fields: List of field names to copy from input documents to
chunk metadata. These fields will be preserved in each chunk created
from the document.
chunk_id_fn: Optional function that takes a Chunk and returns a str,
used to generate chunk IDs. If not provided, random UUIDs will be used.
"""
if not TextSplitter:
raise ImportError(
"langchain is required to use LangChainChunker. "
"Please install it with `pip install langchain`.")
if not isinstance(text_splitter, TextSplitter):
raise TypeError("text_splitter must be a LangChain TextSplitter")
if not document_field:
raise ValueError("document_field cannot be empty")
super().__init__(chunk_id_fn)
self.text_splitter = text_splitter
self.document_field = document_field
self.metadata_fields = metadata_fields

def get_splitter_transform(
self
) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
beam.PCollection[Chunk]]:
return "Langchain text split" >> beam.ParDo(
_LangChainTextSplitter(
text_splitter=self.text_splitter,
document_field=self.document_field,
metadata_fields=self.metadata_fields))


class _LangChainTextSplitter(beam.DoFn):
def __init__(
self,
text_splitter: TextSplitter,
document_field: str,
metadata_fields: List[str]):
self.text_splitter = text_splitter
self.document_field = document_field
self.metadata_fields = metadata_fields

def process(self, element):
text_chunks = self.text_splitter.split_text(element[self.document_field])
metadata = {field: element[field] for field in self.metadata_fields}
for i, text_chunk in enumerate(text_chunks):
yield Chunk(content=Content(text=text_chunk), index=i, metadata=metadata)
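
For intuition, the `process` method above amounts to the following standalone sketch (the splitter parameters are illustrative; `RecursiveCharacterTextSplitter` is the same LangChain splitter shown in the class docstring):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=20, chunk_overlap=5)
element = {'text': 'a long document body that needs splitting',
           'source': 'doc.txt'}

# Mirrors _LangChainTextSplitter.process: split the configured document
# field, then copy the requested metadata fields onto every chunk.
text_chunks = splitter.split_text(element['text'])
metadata = {field: element[field] for field in ['source']}
for i, text_chunk in enumerate(text_chunks):
  print(i, text_chunk, metadata)
```

Note that `metadata` is computed once per element and shared by every chunk from that document, which is why the DoFn builds it outside the loop.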