apache · damccorm · Dec 17, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/sdks/python/apache_beam/ml/rag/__init__.py b/sdks/python/apache_beam/ml/rag/__init__.py
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Apache Beam RAG (Retrieval Augmented Generation) components.
+This package provides components for building RAG pipelines in Apache Beam,
+including:
+- Chunking
+- Embedding generation
+- Vector storage
+- Vector search enrichment
+"""
diff --git a/sdks/python/apache_beam/ml/rag/chunking/__init__.py b/sdks/python/apache_beam/ml/rag/chunking/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Chunking components for RAG pipelines.
+This module provides components for splitting text into chunks for RAG
+pipelines.
+"""
diff --git a/sdks/python/apache_beam/ml/rag/chunking/base.py b/sdks/python/apache_beam/ml/rag/chunking/base.py
@@ -0,0 +1,92 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import abc
+import functools
+from collections.abc import Callable
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+import apache_beam as beam
+from apache_beam.ml.rag.types import Chunk
+from apache_beam.ml.transforms.base import MLTransformProvider
+
+ChunkIdFn = Callable[[Chunk], str]
+
+
+def _assign_chunk_id(chunk_id_fn: ChunkIdFn, chunk: Chunk):
+  chunk.id = chunk_id_fn(chunk)
+  return chunk
+
+
+class ChunkingTransformProvider(MLTransformProvider):
+  def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
+    """Base class for chunking transforms in RAG pipelines.
+
+    ChunkingTransformProvider defines the interface for splitting documents
+    into chunks for embedding and retrieval. Implementations should define how
+    to split content while preserving metadata and managing chunk IDs.
+
+    The transform flow:
+    - Takes input documents with content and metadata
+    - Splits content into chunks using implementation-specific logic
+    - Preserves document metadata in resulting chunks
+    - Optionally assigns unique IDs to chunks (configurable via chunk_id_fn
+
+    Example usage:
+      >>> class MyChunker(ChunkingTransformProvider):
+      ...     def get_splitter_transform(self):
+      ...         return beam.ParDo(MySplitterDoFn())
+      ... 
+      >>> chunker = MyChunker(chunk_id_fn=my_id_function)
+      >>> 
+      >>> with beam.Pipeline() as p:
+      ...     chunks = (
+      ...         p 
+      ...         | beam.Create([{'text': 'document...', 'source': 'doc.txt'}])
+      ...         | MLTransform(...).with_transform(chunker))
+
+    Args:
+      chunk_id_fn: Optional function to generate chunk IDs. If not provided,
+        random UUIDs will be used. Function should take a Chunk and return str.
+    """
+    self.assign_chunk_id_fn = functools.partial(
+        _assign_chunk_id, chunk_id_fn) if chunk_id_fn is not None else None
+
+  @abc.abstractmethod
+  def get_splitter_transform(
+      self
+  ) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
+                       beam.PCollection[Chunk]]:
+    """Creates transforms that emits splits for given content."""
+    raise NotImplementedError(
+        "Subclasses must implement get_splitter_transform")
+
+  def get_ptransform_for_processing(
+      self, **kwargs
+  ) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
+                       beam.PCollection[Chunk]]:
+    """Creates transform for processing documents into chunks."""
+    ptransform = (
+        "Split document" >>
+        self.get_splitter_transform().with_output_types(Chunk))
+    if self.assign_chunk_id_fn:
+      ptransform = (
+          ptransform | "Assign chunk id" >> beam.Map(
+              self.assign_chunk_id_fn).with_output_types(Chunk))
+    return ptransform
diff --git a/sdks/python/apache_beam/ml/rag/chunking/base_test.py b/sdks/python/apache_beam/ml/rag/chunking/base_test.py
@@ -0,0 +1,139 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for apache_beam.ml.rag.chunking.base."""
+
+import unittest
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+import pytest
+
+import apache_beam as beam
+from apache_beam.ml.rag.chunking.base import ChunkIdFn
+from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider
+from apache_beam.ml.rag.types import Chunk
+from apache_beam.ml.rag.types import Content
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
+
+
+class WordSplitter(beam.DoFn):
+  def process(self, element):
+    words = element['text'].split()
+    for i, word in enumerate(words):
+      yield Chunk(
+          content=Content(text=word),
+          index=i,
+          metadata={'source': element['source']})
+
+
+class InvalidChunkingProvider(ChunkingTransformProvider):
+  def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
+    super().__init__(chunk_id_fn=chunk_id_fn)
+
+
+class MockChunkingProvider(ChunkingTransformProvider):
+  def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
+    super().__init__(chunk_id_fn=chunk_id_fn)
+
+  def get_splitter_transform(
+      self
+  ) -> beam.PTransform[beam.PCollection[Dict[str, Any]],
+                       beam.PCollection[Chunk]]:
+    return beam.ParDo(WordSplitter())
+
+
+def chunk_equals(expected, actual):
+  """Custom equality function for Chunk objects."""
+  if not isinstance(expected, Chunk) or not isinstance(actual, Chunk):
+    return False
+  # Don't compare IDs since they're randomly generated
+  return (
+      expected.index == actual.index and expected.content == actual.content and
+      expected.metadata == actual.metadata)
+
+
+def id_equals(expected, actual):
+  """Custom equality function for Chunk object id's."""
+  if not isinstance(expected, Chunk) or not isinstance(actual, Chunk):
+    return False
+  return (expected.id == actual.id)
+
+
+@pytest.mark.uses_transformers
+class ChunkingTransformProviderTest(unittest.TestCase):
+  def setUp(self):
+    self.test_doc = {'text': 'hello world test', 'source': 'test.txt'}
+
+  def test_doesnt_override_get_text_splitter_transform(self):
+    provider = InvalidChunkingProvider()
+    with self.assertRaises(NotImplementedError):
+      provider.get_splitter_transform()
+
+  def test_chunking_transform(self):
+    """Test the complete chunking transform."""
+    provider = MockChunkingProvider()
+
+    with TestPipeline() as p:
+      chunks = (
+          p
+          | beam.Create([self.test_doc])
+          | provider.get_ptransform_for_processing())
+
+      expected = [
+          Chunk(
+              content=Content(text="hello"),
+              index=0,
+              metadata={'source': 'test.txt'}),
+          Chunk(
+              content=Content(text="world"),
+              index=1,
+              metadata={'source': 'test.txt'}),
+          Chunk(
+              content=Content(text="test"),
+              index=2,
+              metadata={'source': 'test.txt'})
+      ]
+
+      assert_that(chunks, equal_to(expected, equals_fn=chunk_equals))
+
+  def test_custom_chunk_id_fn(self):
+    """Test the a custom chink id function."""
+    def source_index_id_fn(chunk: Chunk):
+      return f"{chunk.metadata['source']}_{chunk.index}"
+
+    provider = MockChunkingProvider(chunk_id_fn=source_index_id_fn)
+
+    with TestPipeline() as p:
+      chunks = (
+          p
+          | beam.Create([self.test_doc])
+          | provider.get_ptransform_for_processing())
+
+      expected = [
+          Chunk(content=Content(text="hello"), id="test.txt_0"),
+          Chunk(content=Content(text="world"), id="test.txt_1"),
+          Chunk(content=Content(text="test"), id="test.txt_2")
+      ]
+
+      assert_that(chunks, equal_to(expected, equals_fn=id_equals))
+
+
+if __name__ == '__main__':
+  unittest.main()