Skip to content

Commit

Permalink
feat(Low-Code Concurrent CDK): Allow non-incremental substreams and list based partition router streams with parents to be processed by the concurrent cdk (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
brianjlai authored Dec 3, 2024
1 parent 72202ee commit cd1bd1c
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 55 deletions.
64 changes: 60 additions & 4 deletions airbyte_cdk/sources/declarative/concurrent_declarative_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
AlwaysAvailableAvailabilityStrategy,
)
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
from airbyte_cdk.sources.types import Config, StreamState
Expand All @@ -69,6 +70,15 @@ def __init__(
component_factory: Optional[ModelToComponentFactory] = None,
**kwargs: Any,
) -> None:
# To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
# cursors. We do this by no longer automatically instantiating RFR cursors when converting
# the declarative models into runtime components. Concurrent sources will continue to checkpoint
# incremental streams running in full refresh.
component_factory = component_factory or ModelToComponentFactory(
emit_connector_builder_messages=emit_connector_builder_messages,
disable_resumable_full_refresh=True,
)

super().__init__(
source_config=source_config,
debug=debug,
Expand Down Expand Up @@ -191,13 +201,24 @@ def _group_streams(
# these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
# so we need to treat them as synchronous
if isinstance(declarative_stream, DeclarativeStream):
datetime_based_cursor_component_definition = name_to_stream_mapping[
incremental_sync_component_definition = name_to_stream_mapping[
declarative_stream.name
].get("incremental_sync")

partition_router_component_definition = (
name_to_stream_mapping[declarative_stream.name]
.get("retriever")
.get("partition_router")
)

is_substream_without_incremental = (
partition_router_component_definition
and not incremental_sync_component_definition
)

if (
datetime_based_cursor_component_definition
and datetime_based_cursor_component_definition.get("type", "")
incremental_sync_component_definition
and incremental_sync_component_definition.get("type", "")
== DatetimeBasedCursorModel.__name__
and self._stream_supports_concurrent_partition_processing(
declarative_stream=declarative_stream
Expand All @@ -213,7 +234,7 @@ def _group_streams(
self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
state_manager=state_manager,
model_type=DatetimeBasedCursorModel,
component_definition=datetime_based_cursor_component_definition,
component_definition=incremental_sync_component_definition,
stream_name=declarative_stream.name,
stream_namespace=declarative_stream.namespace,
config=config or {},
Expand Down Expand Up @@ -247,6 +268,41 @@ def _group_streams(
cursor=cursor,
)
)
elif is_substream_without_incremental and hasattr(
declarative_stream.retriever, "stream_slicer"
):
partition_generator = StreamSlicerPartitionGenerator(
DeclarativePartitionFactory(
declarative_stream.name,
declarative_stream.get_json_schema(),
self._retriever_factory(
name_to_stream_mapping[declarative_stream.name],
config,
{},
),
self.message_repository,
),
declarative_stream.retriever.stream_slicer,
)

final_state_cursor = FinalStateCursor(
stream_name=declarative_stream.name,
stream_namespace=declarative_stream.namespace,
message_repository=self.message_repository,
)

concurrent_streams.append(
DefaultStream(
partition_generator=partition_generator,
name=declarative_stream.name,
json_schema=declarative_stream.get_json_schema(),
availability_strategy=AlwaysAvailableAvailabilityStrategy(),
primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
cursor_field=None,
logger=self.logger,
cursor=final_state_cursor,
)
)
else:
synchronous_streams.append(declarative_stream)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ def __init__(
emit_connector_builder_messages: bool = False,
disable_retries: bool = False,
disable_cache: bool = False,
disable_resumable_full_refresh: bool = False,
message_repository: Optional[MessageRepository] = None,
):
self._init_mappings()
Expand All @@ -395,6 +396,7 @@ def __init__(
self._emit_connector_builder_messages = emit_connector_builder_messages
self._disable_retries = disable_retries
self._disable_cache = disable_cache
self._disable_resumable_full_refresh = disable_resumable_full_refresh
self._message_repository = message_repository or InMemoryMessageRepository( # type: ignore
self._evaluate_log_level(emit_connector_builder_messages)
)
Expand Down Expand Up @@ -1339,6 +1341,8 @@ def _merge_stream_slicers(
if model.incremental_sync
else None
)
elif self._disable_resumable_full_refresh:
return stream_slicer
elif stream_slicer:
# For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor`
return PerPartitionCursor(
Expand Down
1 change: 1 addition & 0 deletions airbyte_cdk/sources/streams/concurrent/default_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def as_airbyte_stream(self) -> AirbyteStream:
name=self.name,
json_schema=dict(self._json_schema),
supported_sync_modes=[SyncMode.full_refresh],
is_resumable=False,
)

if self._namespace:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_jsonl_decoder(requests_mock, response_body, expected_json):
def large_event_response_fixture():
data = {"email": "[email protected]"}
jsonl_string = f"{json.dumps(data)}\n"
lines_in_response = 2_000_000 # ≈ 58 MB of response
lines_in_response = 2  # NOTE(review): reduced from 2_000_000 (≈ 58 MB) — stale size comment; looks like a debug leftover that guts the large-payload test, confirm intentional
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = f"{dir_path}/test_response.txt"
with open(file_path, "w") as file:
Expand Down
Loading

0 comments on commit cd1bd1c

Please sign in to comment.