Skip to content

Commit

Permalink
feat(Low-Code Concurrent CDK): Allow non-incremental substreams and list based partition router streams with parents to be processed by the concurrent cdk (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
brianjlai authored Dec 3, 2024
1 parent 72202ee commit cd1bd1c
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 55 deletions.
64 changes: 60 additions & 4 deletions airbyte_cdk/sources/declarative/concurrent_declarative_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
AlwaysAvailableAvailabilityStrategy,
)
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
from airbyte_cdk.sources.types import Config, StreamState
Expand All @@ -69,6 +70,15 @@ def __init__(
component_factory: Optional[ModelToComponentFactory] = None,
**kwargs: Any,
) -> None:
# To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
# cursors. We do this by no longer automatically instantiating RFR cursors when converting
# the declarative models into runtime components. Concurrent sources will continue to checkpoint
# incremental streams running in full refresh.
component_factory = component_factory or ModelToComponentFactory(
emit_connector_builder_messages=emit_connector_builder_messages,
disable_resumable_full_refresh=True,
)

super().__init__(
source_config=source_config,
debug=debug,
Expand Down Expand Up @@ -191,13 +201,24 @@ def _group_streams(
# these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
# so we need to treat them as synchronous
if isinstance(declarative_stream, DeclarativeStream):
datetime_based_cursor_component_definition = name_to_stream_mapping[
incremental_sync_component_definition = name_to_stream_mapping[
declarative_stream.name
].get("incremental_sync")

partition_router_component_definition = (
name_to_stream_mapping[declarative_stream.name]
.get("retriever")
.get("partition_router")
)

is_substream_without_incremental = (
partition_router_component_definition
and not incremental_sync_component_definition
)

if (
datetime_based_cursor_component_definition
and datetime_based_cursor_component_definition.get("type", "")
incremental_sync_component_definition
and incremental_sync_component_definition.get("type", "")
== DatetimeBasedCursorModel.__name__
and self._stream_supports_concurrent_partition_processing(
declarative_stream=declarative_stream
Expand All @@ -213,7 +234,7 @@ def _group_streams(
self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
state_manager=state_manager,
model_type=DatetimeBasedCursorModel,
component_definition=datetime_based_cursor_component_definition,
component_definition=incremental_sync_component_definition,
stream_name=declarative_stream.name,
stream_namespace=declarative_stream.namespace,
config=config or {},
Expand Down Expand Up @@ -247,6 +268,41 @@ def _group_streams(
cursor=cursor,
)
)
elif is_substream_without_incremental and hasattr(
declarative_stream.retriever, "stream_slicer"
):
partition_generator = StreamSlicerPartitionGenerator(
DeclarativePartitionFactory(
declarative_stream.name,
declarative_stream.get_json_schema(),
self._retriever_factory(
name_to_stream_mapping[declarative_stream.name],
config,
{},
),
self.message_repository,
),
declarative_stream.retriever.stream_slicer,
)

final_state_cursor = FinalStateCursor(
stream_name=declarative_stream.name,
stream_namespace=declarative_stream.namespace,
message_repository=self.message_repository,
)

concurrent_streams.append(
DefaultStream(
partition_generator=partition_generator,
name=declarative_stream.name,
json_schema=declarative_stream.get_json_schema(),
availability_strategy=AlwaysAvailableAvailabilityStrategy(),
primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
cursor_field=None,
logger=self.logger,
cursor=final_state_cursor,
)
)
else:
synchronous_streams.append(declarative_stream)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ def __init__(
emit_connector_builder_messages: bool = False,
disable_retries: bool = False,
disable_cache: bool = False,
disable_resumable_full_refresh: bool = False,
message_repository: Optional[MessageRepository] = None,
):
self._init_mappings()
Expand All @@ -395,6 +396,7 @@ def __init__(
self._emit_connector_builder_messages = emit_connector_builder_messages
self._disable_retries = disable_retries
self._disable_cache = disable_cache
self._disable_resumable_full_refresh = disable_resumable_full_refresh
self._message_repository = message_repository or InMemoryMessageRepository( # type: ignore
self._evaluate_log_level(emit_connector_builder_messages)
)
Expand Down Expand Up @@ -1339,6 +1341,8 @@ def _merge_stream_slicers(
if model.incremental_sync
else None
)
elif self._disable_resumable_full_refresh:
return stream_slicer
elif stream_slicer:
# For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor`
return PerPartitionCursor(
Expand Down
1 change: 1 addition & 0 deletions airbyte_cdk/sources/streams/concurrent/default_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def as_airbyte_stream(self) -> AirbyteStream:
name=self.name,
json_schema=dict(self._json_schema),
supported_sync_modes=[SyncMode.full_refresh],
is_resumable=False,
)

if self._namespace:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_jsonl_decoder(requests_mock, response_body, expected_json):
def large_event_response_fixture():
data = {"email": "[email protected]"}
jsonl_string = f"{json.dumps(data)}\n"
lines_in_response = 2_000_000 # ≈ 58 MB of response
lines_in_response = 2  # NOTE(review): reduced from 2_000_000 (≈ 58 MB) — stale size comment; looks like a debug leftover that guts the large-payload test, confirm intentional
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = f"{dir_path}/test_response.txt"
with open(file_path, "w") as file:
Expand Down
Loading

0 comments on commit cd1bd1c

Please sign in to comment.