Linter fixes

claudevdm committed Dec 8, 2024
1 parent 55cacf2 commit 0b6bdde
Showing 14 changed files with 44 additions and 81 deletions.
3 changes: 1 addition & 2 deletions sdks/python/apache_beam/ml/rag/chunking/base.py
@@ -18,7 +18,7 @@
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransformProvider
from apache_beam.ml.rag.types import Chunk
-from typing import List, Optional
+from typing import Optional
from collections.abc import Callable
import abc
import uuid
@@ -37,7 +37,6 @@ def assign_chunk_id(chunk_id_fn: ChunkIdFn, chunk: Chunk):


class ChunkingTransformProvider(MLTransformProvider):
-
def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
self.assign_chunk_id_fn = functools.partial(
assign_chunk_id,
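
The provider keeps its optional chunk_id_fn hook, so callers can still control how chunk IDs are assigned (the uuid import above backs the default). A minimal sketch of wiring in a deterministic ID function — the splitter DoFn, field names, and ID scheme are illustrative, not part of this commit:

import apache_beam as beam
from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider
from apache_beam.ml.rag.types import Chunk, Content

def source_and_index_id(chunk: Chunk) -> str:
  # Deterministic ID, stable across pipeline reruns (unlike a random uuid).
  return f"{chunk.metadata['source']}_{chunk.index}"

class ParagraphSplitter(beam.DoFn):
  # Hypothetical splitter: one Chunk per blank-line-separated paragraph.
  def process(self, element):
    for i, para in enumerate(element['text'].split('\n\n')):
      yield Chunk(
          content=Content(text=para),
          index=i,
          metadata={'source': element['source']})

class ParagraphChunkingProvider(ChunkingTransformProvider):
  def __init__(self):
    super().__init__(chunk_id_fn=source_and_index_id)

  def get_text_splitter_transform(self) -> beam.DoFn:
    return ParagraphSplitter()
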
6 changes: 1 addition & 5 deletions sdks/python/apache_beam/ml/rag/chunking/base_test.py
@@ -25,11 +25,10 @@
from apache_beam.testing.util import equal_to
from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider, ChunkIdFn
from apache_beam.ml.rag.types import Chunk, Content
-from typing import List, Optional
+from typing import Optional


class WordSplitter(beam.DoFn):
-
def process(self, element):
words = element['text'].split()
for i, word in enumerate(words):
@@ -40,7 +39,6 @@ def process(self, element):


class MockChunkingProvider(ChunkingTransformProvider):
-
def __init__(self, chunk_id_fn: Optional[ChunkIdFn] = None):
super().__init__(chunk_id_fn=chunk_id_fn)

@@ -67,7 +65,6 @@ def id_equals(expected, actual):

@pytest.mark.uses_transformers
class ChunkingTransformProviderTest(unittest.TestCase):
-
def setUp(self):
self.test_doc = {'text': 'hello world test', 'source': 'test.txt'}

@@ -100,7 +97,6 @@ def test_chunking_transform(self):

def test_custom_chunk_id_fn(self):
"""Test a custom chunk id function."""
-
def source_index_id_fn(chunk: Chunk):
return f"{chunk.metadata['source']}_{chunk.index}"

4 changes: 1 addition & 3 deletions sdks/python/apache_beam/ml/rag/chunking/langchain.py
@@ -23,12 +23,11 @@


class LangChainChunkingProvider(ChunkingTransformProvider):
-
def __init__(
self,
text_splitter: TextSplitter,
document_field: str,
-metadata_fields: List[str] = [],
+metadata_fields: List[str],
chunk_id_fn: Optional[ChunkIdFn] = None):
if not isinstance(text_splitter, TextSplitter):
raise TypeError("text_splitter must be a LangChain TextSplitter")
@@ -48,7 +47,6 @@ def get_text_splitter_transform(self) -> beam.DoFn:


class LangChainTextSplitter(beam.DoFn):
-
def __init__(
self,
text_splitter: TextSplitter,
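
Dropping the mutable default (an empty list as a default argument is a classic Python lint error) makes metadata_fields a required argument, which the test updates below reflect by passing metadata_fields=[] explicitly. A usage sketch under the signature shown above — the field names are taken from the tests, not fixed by the API:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from apache_beam.ml.rag.chunking.langchain import LangChainChunkingProvider

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
provider = LangChainChunkingProvider(
    text_splitter=splitter,
    document_field='content',    # input dict field holding the document text
    metadata_fields=['source'])  # now required; pass [] for no metadata
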
24 changes: 14 additions & 10 deletions sdks/python/apache_beam/ml/rag/chunking/langchain_test.py
@@ -19,17 +19,21 @@
import unittest

import apache_beam as beam

from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

-from langchain.text_splitter import (
-RecursiveCharacterTextSplitter,
-CharacterTextSplitter,
-)

from apache_beam.ml.rag.chunking.langchain import LangChainChunkingProvider
-from apache_beam.ml.rag.types import Chunk, Content
+from apache_beam.ml.rag.types import Chunk

+try:
+from langchain.text_splitter import (
+RecursiveCharacterTextSplitter,
+CharacterTextSplitter,
+)
+LANGCHAIN_AVAILABLE = True
+except ImportError:
+LANGCHAIN_AVAILABLE = False

# Import optional dependencies
try:
@@ -49,7 +53,6 @@ def chunk_equals(expected, actual):


class LangChainChunkingTest(unittest.TestCase):
-
def setUp(self):
self.simple_text = {
'content': 'This is a simple test document. It has multiple sentences. '
@@ -72,7 +75,7 @@ def test_no_metadata_fields(self):
"""Test chunking with no metadata fields specified."""
splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
provider = LangChainChunkingProvider(
-document_field='content', text_splitter=splitter)
+document_field='content', metadata_fields=[], text_splitter=splitter)

with TestPipeline() as p:
chunks = (
@@ -102,7 +105,8 @@ def test_multiple_metadata_fields(self):

assert_that(chunks_count, lambda x: x[0] > 0, 'Has chunks')
assert_that(
-chunks, lambda x: all(
+chunks,
+lambda x: all(
c.metadata == {
'source': 'simple.txt', 'language': 'en'
} for c in x))
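
The guarded langchain import added above follows the optional-dependency pattern already used for sentence-transformers elsewhere in this commit: a module-level availability flag paired with unittest.skipIf. A generic sketch of the pattern (names here are placeholders):

import unittest

try:
  import langchain  # optional dependency
  LANGCHAIN_AVAILABLE = True
except ImportError:
  LANGCHAIN_AVAILABLE = False

@unittest.skipIf(not LANGCHAIN_AVAILABLE, "langchain not available")
class SomeLangChainTest(unittest.TestCase):
  def test_placeholder(self):
    self.assertTrue(LANGCHAIN_AVAILABLE)
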
12 changes: 8 additions & 4 deletions sdks/python/apache_beam/ml/rag/embeddings/base.py
@@ -32,9 +32,13 @@ def create_rag_adapter() -> EmbeddingTypeAdapter:
"""
return EmbeddingTypeAdapter(
input_fn=lambda chunks: [chunk.content.text for chunk in chunks],
-output_fn=lambda chunks, embeddings: [
+output_fn=lambda chunks,
+embeddings: [
Embedding(
-id=chunk.id, dense_embedding=embeddings, sparse_embedding=None,
-metadata=chunk.metadata, content=chunk.content)
-for chunk, embeddings in zip(chunks, embeddings)
+id=chunk.id,
+dense_embedding=embeddings,
+sparse_embedding=None,
+metadata=chunk.metadata,
+content=chunk.content) for chunk,
+embeddings in zip(chunks, embeddings)
])
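
The yapf reflow above is hard to read, not least because the loop variable shadows the embeddings parameter. A functionally equivalent helper, for clarity only (the committed code keeps the lambda):

from apache_beam.ml.rag.types import Embedding

def chunks_to_embeddings(chunks, embeddings):
  # Pair each Chunk with its vector, carrying id, metadata, and content
  # through; this adapter never populates sparse embeddings.
  return [
      Embedding(
          id=chunk.id,
          dense_embedding=vector,
          sparse_embedding=None,
          metadata=chunk.metadata,
          content=chunk.content) for chunk, vector in zip(chunks, embeddings)
  ]
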
1 change: 0 additions & 1 deletion sdks/python/apache_beam/ml/rag/embeddings/base_test.py
@@ -20,7 +20,6 @@


class RAGBaseEmbeddingsTest(unittest.TestCase):
-
def setUp(self):
self.test_chunks = [
Chunk(
1 change: 0 additions & 1 deletion sdks/python/apache_beam/ml/rag/embeddings/huggingface.py
@@ -38,7 +38,6 @@ class HuggingfaceTextEmbeddings(EmbeddingsManager):
- Copies Chunk.metadata to Embedding.metadata
- Converts model output to Embedding.dense_embedding
"""

def __init__(
self, model_name: str, *, max_seq_length: Optional[int] = None, **kwargs):
"""Initialize RAG embeddings.
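
A minimal wiring sketch for the class above, using the same model name the tests below use; treat the MLTransform plumbing as an assumption about typical usage rather than documented API for this class:

import tempfile
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings

embedder = HuggingfaceTextEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")
# In a pipeline:
#   chunks | MLTransform(
#       write_artifact_location=tempfile.mkdtemp()).with_transform(embedder)
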
8 changes: 3 additions & 5 deletions sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py
@@ -26,16 +26,16 @@
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.ml.transforms.base import MLTransform
-from apache_beam.ml.rag.types import Chunk, Content, Embedding
-from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings

+# pylint: disable=unused-import
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False

+from apache_beam.ml.rag.types import Chunk, Content, Embedding
+from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings


def embedding_approximately_equals(expected, actual):
"""Compare embeddings allowing for numerical differences."""
@@ -52,7 +52,6 @@ def embedding_approximately_equals(expected, actual):
@unittest.skipIf(
not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available")
class HuggingfaceTextEmbeddingsTest(unittest.TestCase):
-
def setUp(self):
self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_')
self.test_chunks = [
@@ -89,7 +88,6 @@ def test_embedding_pipeline(self):
},
content=Content(text="This is a test sentence."))
]
"""Test the complete embedding pipeline."""
embedder = HuggingfaceTextEmbeddings(
model_name="sentence-transformers/all-MiniLM-L6-v2")

@@ -100,7 +100,6 @@ class BigQueryVectorSearchEnrichmentHandler(
EnrichmentSourceHandler[Union[Embedding, List[Embedding]],
Union[Embedding, List[Embedding]]]):
"""Enrichment handler for BigQuery vector search."""

def __init__(
self,
project: str,
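
Only the project parameter of the handler is visible in this hunk; the rest of the signature is collapsed. A sketch of how an EnrichmentSourceHandler like this one is typically applied (the Enrichment transform is standard Beam; the argument list here is assumed, not shown by this diff):

from apache_beam.transforms.enrichment import Enrichment

# BigQueryVectorSearchEnrichmentHandler is defined in the file above.
handler = BigQueryVectorSearchEnrichmentHandler(project='my-gcp-project')
# In a pipeline: embeddings | Enrichment(handler)
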
2 changes: 0 additions & 2 deletions sdks/python/apache_beam/ml/rag/ingestion/base.py
@@ -24,7 +24,6 @@ class VectorDatabaseConfig(ABC):
Implementations should provide database-specific configuration and
create appropriate write transforms.
"""

@abstractmethod
def create_write_transform(self) -> beam.PTransform:
"""Creates a PTransform that writes to the vector database.
@@ -40,7 +39,6 @@ class VectorDatabaseWriteTransform(beam.PTransform):
Uses the provided database config to create an appropriate write transform.
"""

def __init__(self, database_config: VectorDatabaseConfig):
"""Initialize transform with database config.
3 changes: 0 additions & 3 deletions sdks/python/apache_beam/ml/rag/ingestion/base_test.py
@@ -25,14 +25,12 @@

class MockWriteTransform(beam.PTransform):
"""Mock transform that returns element."""

def expand(self, pcoll):
return pcoll | beam.Map(lambda x: x)


class MockDatabaseConfig(VectorDatabaseConfig):
"""Mock database config for testing."""

def __init__(self):
self.write_transform = MockWriteTransform()

@@ -41,7 +39,6 @@ def create_write_transform(self) -> beam.PTransform:


class VectorDatabaseBaseTest(unittest.TestCase):
-
def test_write_transform_creation(self):
"""Test that write transform is created correctly."""
config = MockDatabaseConfig()
9 changes: 3 additions & 6 deletions sdks/python/apache_beam/ml/rag/ingestion/bigquery.py
@@ -56,8 +56,7 @@ def create_write_transform(self) -> beam.PTransform:


class _WriteToBigQueryVectorDatabase(beam.PTransform):
"""Implementation of BigQuery vector database write."""

"""Implementation of BigQuery vector database write. """
def __init__(self, config: BigQueryVectorWriterConfig):
self.config = config

@@ -69,9 +68,7 @@ def expand(self, pcoll: beam.PCollection[Embedding]):
id=lambda x: str(x.id),
embedding=lambda x: [float(v) for v in x.dense_embedding],
content=lambda x: str(x.content.text),
-metadata=lambda x: {
-str(k): str(v)
-for k, v in x.metadata.items()
-})
+metadata=lambda x: {str(k): str(v)
+for k, v in x.metadata.items()})
| "Write to BigQuery" >> beam.managed.Write(
beam.managed.BIGQUERY, config=self.config.write_config))
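
The reflowed metadata lambda is a dict comprehension that stringifies keys and values before the managed BigQuery write. In standalone form, for illustration only:

def stringify_metadata(metadata):
  # Coerce keys and values to str so every row carries a uniform
  # string-to-string metadata map.
  return {str(k): str(v) for k, v in metadata.items()}

print(stringify_metadata({'page': 3, 'lang': 'en'}))  # {'page': '3', 'lang': 'en'}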