From 2dddf8ec29610b7e323a4ab963b4c8d77638b333 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 26 Nov 2024 03:55:02 +0100 Subject: [PATCH 01/25] Add component resolver and http component resolver --- .../declarative_component_schema.yaml | 82 ++++++++++ .../models/declarative_component_schema.py | 58 ++++++++ .../parsers/model_to_component_factory.py | 113 +++++++++++--- .../declarative/partition_routers/__init__.py | 3 +- .../sources/declarative/resolvers/__init__.py | 8 + .../resolvers/components_resolver.py | 49 ++++++ .../resolvers/http_components_resolver.py | 140 ++++++++++++++++++ 7 files changed, 431 insertions(+), 22 deletions(-) create mode 100644 airbyte_cdk/sources/declarative/resolvers/__init__.py create mode 100644 airbyte_cdk/sources/declarative/resolvers/components_resolver.py create mode 100644 airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 654a6b751..cfba4f3ae 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2700,6 +2700,88 @@ definitions: $parameters: type: object additionalProperties: true + ComponentMappingDefinition: + title: Component Mapping Definition + description: Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts. + type: object + required: + - type + - key + - value + properties: + type: + type: string + enum: [ComponentMappingDefinition] + key: + title: Key + description: The target key in the stream template where the value will be added or updated. + type: string + value: + title: Value + description: The dynamic or static value to assign to the key. 
Interpolated values can be used to dynamically determine the value during runtime. + type: string + interpolation_context: + - config + - stream_template_config + - components_values + examples: + - "{{ components_values['updates'] }}" + - "{{ components_values['MetaData']['LastUpdatedTime'] }}" + - "{{ config['segment_id'] }}" + value_type: + title: Value Type + description: The expected data type of the value. If omitted, the type will be inferred from the value provided. + "$ref": "#/definitions/ValueType" + condition: + title: Condition + description: An optional condition that must evaluate to `true` for the mapping to be applied. This can use interpolation for dynamic evaluation. + type: string + default: "" + interpolation_context: + - config + - stream_template_config + - components_values + examples: + - "{{ components_values['created_at'] >= stream_interval['start_time'] }}" + - "{{ components_values.status in ['active', 'expired'] }}" + $parameters: + type: object + additionalProperties: true + HttpComponentsResolver: + type: object + properties: + type: + type: string + enum: [HttpComponentsResolver] + retriever: + title: Retriever + description: Component used to coordinate how records are extracted across stream slices and request pages. + anyOf: + - "$ref": "#/definitions/AsyncRetriever" + - "$ref": "#/definitions/CustomRetriever" + - "$ref": "#/definitions/SimpleRetriever" + components_mapping: + type: array + items: + - "$ref": "#/definitions/ComponentMappingDefinition" + $parameters: + type: object + additionalProperties: true + required: + - type + - retriever + - components_mapping + DynamicDeclarativeStream: + type: object + properties: + stream_template: + title: Stream Template + description: Reference to the stream template. 
+ "$ref": "#/definitions/DeclarativeStream" + components_resolver: + anyOf: + - "$ref": "#/definitions/HttpComponentsResolver" + - "$ref": "#/definitions/ConfigDrivenComponentsParser" interpolation: variables: - title: config diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 174334d9c..bd8f86eab 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1031,6 +1031,44 @@ class WaitUntilTimeFromHeader(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class ComponentMappingDefinition(BaseModel): + type: Literal["ComponentMappingDefinition"] + key: str = Field( + ..., + description="The target key in the stream template where the value will be added or updated.", + title="Key", + ) + value: str = Field( + ..., + description="The dynamic or static value to assign to the key. Interpolated values can be used to dynamically determine the value during runtime.", + examples=[ + "{{ components_values['updates'] }}", + "{{ components_values['MetaData']['LastUpdatedTime'] }}", + "{{ config['segment_id'] }}", + ], + title="Value", + ) + value_type: Optional[ValueType] = Field( + None, + description="The expected data type of the value. If omitted, the type will be inferred from the value provided.", + title="Value Type", + ) + condition: Optional[str] = Field( + "", + description="An optional condition that must evaluate to `true` for the mapping to be applied. 
This can use interpolation for dynamic evaluation.", + examples=[ + "{{ components_values['created_at'] >= stream_interval['start_time'] }}", + "{{ components_values.status in ['active', 'expired'] }}", + ], + title="Condition", + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") + + +class ConfigDrivenComponentsParser(BaseModel): + __root__: Any + + class AddedFieldDefinition(BaseModel): type: Literal["AddedFieldDefinition"] path: List[str] = Field( @@ -1739,6 +1777,26 @@ class SubstreamPartitionRouter(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class HttpComponentsResolver(BaseModel): + type: Literal["HttpComponentsResolver"] + retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field( + ..., + description="Component used to coordinate how records are extracted across stream slices and request pages.", + title="Retriever", + ) + components_mapping: List[ComponentMappingDefinition] + parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") + + +class DynamicDeclarativeStream(BaseModel): + stream_template: Optional[DeclarativeStream] = Field( + None, description="Reference to the stream template.", title="Stream Template" + ) + components_resolver: Optional[Union[HttpComponentsResolver, ConfigDrivenComponentsParser]] = ( + None + ) + + CompositeErrorHandler.update_forward_refs() DeclarativeSource.update_forward_refs() SelectiveAuthenticator.update_forward_refs() diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index d2dd9d9dc..c81d901d2 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -30,6 +30,10 @@ from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator from airbyte_cdk.sources.declarative.async_job.job_tracker 
import JobTracker from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.declarative.resolvers import ( + HttpComponentsResolver, + ComponentMappingDefinition, +) from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( @@ -277,6 +281,12 @@ SimpleRetriever as SimpleRetrieverModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HttpComponentsResolver as HttpComponentsResolverModel, +) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + ComponentMappingDefinition as ComponentMappingDefinitionModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( SubstreamPartitionRouter as SubstreamPartitionRouterModel, ) @@ -291,6 +301,7 @@ XmlDecoder as XmlDecoderModel, ) from airbyte_cdk.sources.declarative.partition_routers import ( + PartitionRouter, CartesianProductStreamSlicer, ListPartitionRouter, SinglePartitionRouter, @@ -461,6 +472,8 @@ def _init_mappings(self) -> None: WaitTimeFromHeaderModel: self.create_wait_time_from_header, WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, AsyncRetrieverModel: self.create_async_retriever, + HttpComponentsResolverModel: self.create_http_components_resolver, + ComponentMappingDefinitionModel: self.create_components_mapping_definition, } # Needed for the case where we need to perform a second parse on the fields of a custom component @@ -1279,19 +1292,20 @@ def create_declarative_stream( parameters=model.parameters or {}, ) - def _merge_stream_slicers( - self, model: DeclarativeStreamModel, config: Config - ) -> Optional[StreamSlicer]: - stream_slicer = None + def 
_build_stream_slicer_from_partition_router( + self, + model: Union[AsyncRetrieverModel, CustomRetrieverModel, SimpleRetrieverModel], + config: Config, + ) -> Optional[PartitionRouter]: if ( - hasattr(model.retriever, "partition_router") - and isinstance(model.retriever, SimpleRetrieverModel) - and model.retriever.partition_router + hasattr(model, "partition_router") + and isinstance(model, SimpleRetrieverModel) + and model.partition_router ): - stream_slicer_model = model.retriever.partition_router + stream_slicer_model = model.partition_router if isinstance(stream_slicer_model, list): - stream_slicer = CartesianProductStreamSlicer( + return CartesianProductStreamSlicer( [ self._create_component_from_model(model=slicer, config=config) for slicer in stream_slicer_model @@ -1299,9 +1313,24 @@ def _merge_stream_slicers( parameters={}, ) else: - stream_slicer = self._create_component_from_model( - model=stream_slicer_model, config=config - ) + return self._create_component_from_model(model=stream_slicer_model, config=config) # type: ignore[no-any-return] + # Will be created PartitionRouter as stream_slicer_model is model.partition_router + return None + + def _build_resumable_cursor_from_paginator( + self, + model: Union[AsyncRetrieverModel, CustomRetrieverModel, SimpleRetrieverModel], + stream_slicer: Optional[StreamSlicer], + ) -> Optional[StreamSlicer]: + if hasattr(model, "paginator") and model.paginator and not stream_slicer: + # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` + return ResumableFullRefreshCursor(parameters={}) + return None + + def _merge_stream_slicers( + self, model: DeclarativeStreamModel, config: Config + ) -> Optional[StreamSlicer]: + stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) if model.incremental_sync and stream_slicer: incremental_sync_model = model.incremental_sync @@ -1342,15 +1371,7 @@ def _merge_stream_slicers( ), partition_router=stream_slicer, ) - 
elif ( - hasattr(model.retriever, "paginator") - and model.retriever.paginator - and not stream_slicer - ): - # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` - return ResumableFullRefreshCursor(parameters={}) - else: - return None + return self._build_resumable_cursor_from_paginator(model.retriever, stream_slicer) def create_default_error_handler( self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any @@ -2182,3 +2203,53 @@ def get_message_repository(self) -> MessageRepository: def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: return Level.DEBUG if emit_connector_builder_messages else Level.INFO + + @staticmethod + def create_components_mapping_definition( + model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any + ) -> ComponentMappingDefinition: + interpolated_value = InterpolatedString.create( + model.value, parameters=model.parameters or {} + ) + return ComponentMappingDefinition( + key=model.key, + value=interpolated_value, + value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), + condition=model.condition or "", + parameters=model.parameters or {}, + ) + + def create_http_components_resolver( + self, model: HttpComponentsResolverModel, config: Config + ) -> Any: + stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) + combined_slicers = self._build_resumable_cursor_from_paginator( + model.retriever, stream_slicer + ) + + retriever = self._create_component_from_model( + model=model.retriever, + config=config, + name="", + primary_key=None, + stream_slicer=combined_slicers, + transformations=[], + ) + + components_mapping = [ + self._create_component_from_model( + model=components_mapping_definition_model, + value_type=ModelToComponentFactory._json_schema_type_name_to_type( + components_mapping_definition_model.value_type + ), + config=config, + ) + for components_mapping_definition_model in 
model.components_mapping + ] + + return HttpComponentsResolver( + retriever=retriever, + config=config, + components_mapping=components_mapping, + parameters=model.parameters or {}, + ) diff --git a/airbyte_cdk/sources/declarative/partition_routers/__init__.py b/airbyte_cdk/sources/declarative/partition_routers/__init__.py index 86e472a42..9487f5e1d 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/__init__.py +++ b/airbyte_cdk/sources/declarative/partition_routers/__init__.py @@ -6,5 +6,6 @@ from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter -__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter"] +__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter", "PartitionRouter"] diff --git a/airbyte_cdk/sources/declarative/resolvers/__init__.py b/airbyte_cdk/sources/declarative/resolvers/__init__.py new file mode 100644 index 000000000..8dcbaf2e2 --- /dev/null +++ b/airbyte_cdk/sources/declarative/resolvers/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.resolvers.components_resolver import ComponentsResolver, ComponentMappingDefinition, ResolvedComponentMappingDefinition +from airbyte_cdk.sources.declarative.resolvers.http_components_resolver import HttpComponentsResolver + +__all__ = ["ComponentsResolver", "HttpComponentsResolver", "ComponentMappingDefinition", "ResolvedComponentMappingDefinition"] diff --git a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py new file mode 100644 index 000000000..69303cc02 --- /dev/null +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import InitVar, dataclass +from typing import Any, Dict, Mapping, Optional, Type, Union, Iterable + +from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString + + +@dataclass(frozen=True) +class ComponentMappingDefinition: + """Defines the key-value mapping configuration for a stream component.""" + + key: str + value: Union["InterpolatedString", str] + value_type: Optional[Type[Any]] + parameters: InitVar[Mapping[str, Any]] + condition: str = "" + + +@dataclass(frozen=True) +class ResolvedComponentMappingDefinition: + """Represents a parsed and resolved component mapping for a stream configuration.""" + + key: str + value: Union["InterpolatedString", str] + value_type: Optional[Type[Any]] + parameters: InitVar[Mapping[str, Any]] + condition: Optional[Union["InterpolatedBoolean", str]] = "" + + +@dataclass +class ComponentsResolver: + """ + Abstract base class for resolving components in a stream template. + """ + + @abstractmethod + def resolve_components( + self, stream_template_config: Dict[str, Any] + ) -> Iterable[Dict[str, Any]]: + """ + Maps and populates values into a stream template configuration. 
+ :param stream_template_config: The stream template with placeholders for components. + :yields: The resolved stream config with populated values. + """ + pass diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py new file mode 100644 index 000000000..5abe8dd4e --- /dev/null +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -0,0 +1,140 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from copy import deepcopy +from dataclasses import InitVar, dataclass, field +from typing import Any, Dict, List, Mapping, Optional, Iterable, Union + +from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.types import Config +from airbyte_cdk.sources.declarative.resolvers.components_resolver import ( + ComponentsResolver, + ComponentMappingDefinition, + ResolvedComponentMappingDefinition, +) + + +@dataclass +class HttpComponentsResolver(ComponentsResolver): + """ + Resolves and populates stream templates with components fetched via an HTTP retriever. + + Attributes: + retriever (Retriever): The retriever used to fetch data from an API. + config (Config): Configuration object for the resolver. + components_mapping (List[ComponentMappingDefinition]): List of mappings to resolve. + parameters (InitVar[Mapping[str, Any]]): Additional parameters for interpolation. + """ + + retriever: Retriever + config: Config + components_mapping: List[ComponentMappingDefinition] + parameters: InitVar[Mapping[str, Any]] + _resolved_components: List[ResolvedComponentMappingDefinition] = field( + init=False, repr=False, default_factory=list + ) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + """ + Initializes and parses component mappings, converting them to resolved definitions. 
+ + Args: + parameters (Mapping[str, Any]): Parameters for interpolation. + """ + for component_mapping in self.components_mapping: + condition = component_mapping.condition or True + + if isinstance(component_mapping.value, (str, InterpolatedString)): + interpolated_value = ( + InterpolatedString.create(component_mapping.value, parameters=parameters) + if isinstance(component_mapping.value, str) + else component_mapping.value + ) + self._resolved_components.append( + ResolvedComponentMappingDefinition( + key=component_mapping.key, + value=interpolated_value, + value_type=component_mapping.value_type, + condition=InterpolatedBoolean(condition=condition, parameters=parameters), + parameters=parameters, + ) + ) + else: + raise ValueError( + f"Expected a string or InterpolatedString for value in mapping: {component_mapping}" + ) + + def _update_config( + self, + component_config: Dict[str, Any], + key: str, + value: Any, + condition: Optional[Union[InterpolatedBoolean, str]], + **kwargs, + ) -> Dict[str, Any]: + """ + Recursively updates the configuration dictionary for a specific key. + + Args: + component_config (Dict[str, Any]): Component config to update. + key (str): Target key to update. + value (Any): Value to assign to the target key. + condition (Optional[InterpolatedBoolean]): Condition for applying the update. + + Returns: + Dict[str, Any]: Updated configuration dictionary. 
+ """ + kwargs["current_component_config"] = component_config + should_update = condition.eval(self.config, **kwargs) if condition else True + + for key, value in component_config.items(): + if key == key and should_update: + component_config[key] = value + elif isinstance(value, dict): + component_config[key] = self._update_config(value, key, value, condition, **kwargs) + elif isinstance(value, list): + component_config[key] = [ + self._update_config(item, key, value, condition, **kwargs) + if isinstance(item, dict) + else item + for item in value + ] + + return component_config + + def resolve_components( + self, stream_template_config: Dict[str, Any] + ) -> Iterable[Dict[str, Any]]: + """ + Resolves components in the stream template configuration by populating values. + + Args: + stream_template_config (Dict[str, Any]): Stream template to populate. + + Yields: + Dict[str, Any]: Updated configurations with resolved components. + """ + kwargs = {"stream_template_config": stream_template_config} + + for components_values in self.retriever.read_records({}): + updated_config = deepcopy(stream_template_config) + kwargs["components_values"] = components_values + + for resolved_component in self._resolved_components: + valid_types = ( + (resolved_component.value_type,) if resolved_component.value_type else None + ) + value = resolved_component.value.eval( + self.config, valid_types=valid_types, **kwargs + ) + updated_config = self._update_config( + updated_config, + key=resolved_component.key, + value=value, + condition=resolved_component.condition, + **kwargs, + ) + + yield updated_config From c12f3511b532c3b2742f1179c06ee2273669eb09 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 26 Nov 2024 04:22:06 +0100 Subject: [PATCH 02/25] Fix mypy --- .../declarative/resolvers/components_resolver.py | 2 +- .../declarative/resolvers/http_components_resolver.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py index 69303cc02..002d4767b 100644 --- a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ -25,7 +25,7 @@ class ResolvedComponentMappingDefinition: """Represents a parsed and resolved component mapping for a stream configuration.""" key: str - value: Union["InterpolatedString", str] + value: "InterpolatedString" value_type: Optional[Type[Any]] parameters: InitVar[Mapping[str, Any]] condition: Optional[Union["InterpolatedBoolean", str]] = "" diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 5abe8dd4e..5a2b71b77 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -44,7 +44,7 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: parameters (Mapping[str, Any]): Parameters for interpolation. """ for component_mapping in self.components_mapping: - condition = component_mapping.condition or True + condition = component_mapping.condition or "True" if isinstance(component_mapping.value, (str, InterpolatedString)): interpolated_value = ( @@ -71,8 +71,8 @@ def _update_config( component_config: Dict[str, Any], key: str, value: Any, - condition: Optional[Union[InterpolatedBoolean, str]], - **kwargs, + condition: Optional[InterpolatedBoolean], + **kwargs: Any, ) -> Dict[str, Any]: """ Recursively updates the configuration dictionary for a specific key. 
@@ -120,7 +120,7 @@ def resolve_components( for components_values in self.retriever.read_records({}): updated_config = deepcopy(stream_template_config) - kwargs["components_values"] = components_values + kwargs["components_values"] = components_values # type: ignore[assignment] # component_values will always be of type Mapping[str, Any] for resolved_component in self._resolved_components: valid_types = ( @@ -133,7 +133,7 @@ def resolve_components( updated_config, key=resolved_component.key, value=value, - condition=resolved_component.condition, + condition=resolved_component.condition, # type: ignore[arg-type] # The condition in resolved_component always has the type InterpolatedBoolean if it exists. **kwargs, ) From e2505b151ac3b836372fd894f500f6d73274a1ba Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 26 Nov 2024 04:26:55 +0100 Subject: [PATCH 03/25] Fix formatting --- .../sources/declarative/resolvers/http_components_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 5a2b71b77..9395c3e2a 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -4,7 +4,7 @@ from copy import deepcopy from dataclasses import InitVar, dataclass, field -from typing import Any, Dict, List, Mapping, Optional, Iterable, Union +from typing import Any, Dict, List, Mapping, Optional, Iterable from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever From c8e350972db564a89f9cfa043f498a438450defd Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Wed, 27 Nov 2024 00:34:48 +0100 Subject: [PATCH 04/25] Added dynamic stream component --- .../concurrent_declarative_source.py | 3 +- 
.../declarative_component_schema.yaml | 11 ++- .../manifest_declarative_source.py | 54 +++++++++++- .../models/declarative_component_schema.py | 85 +++++++++++-------- .../sources/declarative/resolvers/__init__.py | 7 +- .../resolvers/components_resolver.py | 4 +- .../resolvers/http_components_resolver.py | 3 + 7 files changed, 126 insertions(+), 41 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 001740a35..02b7865d6 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -181,7 +181,8 @@ def _group_streams( state_manager = ConnectorStateManager(state=self._state) # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later name_to_stream_mapping = { - stream["name"]: stream for stream in self.resolved_manifest["streams"] + stream["name"]: stream + for stream in self._stream_configs(self.resolved_manifest, config) } for declarative_stream in self.streams(config=config): diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index cfba4f3ae..7786cde4f 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -19,6 +19,10 @@ properties: type: array items: "$ref": "#/definitions/DeclarativeStream" + dynamic_streams: + type: array + items: + "$ref": "#/definitions/DynamicDeclarativeStream" version: type: string description: The version of the Airbyte CDK used to build and test the source. 
@@ -2702,7 +2706,7 @@ definitions: additionalProperties: true ComponentMappingDefinition: title: Component Mapping Definition - description: Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts. + description: Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts. (This component is experimental. Use at your own risk.) type: object required: - type @@ -2749,6 +2753,7 @@ definitions: additionalProperties: true HttpComponentsResolver: type: object + description: Component resolve and populates stream templates with components fetched via an HTTP retriever. (This component is experimental. Use at your own risk.) properties: type: type: string @@ -2773,15 +2778,17 @@ definitions: - components_mapping DynamicDeclarativeStream: type: object + description: A component that described how will be created declarative streams based on stream template. (This component is experimental. Use at your own risk.) properties: stream_template: title: Stream Template description: Reference to the stream template. "$ref": "#/definitions/DeclarativeStream" components_resolver: + title: Components Resolver + description: Component resolve and populates stream templates with components values. 
anyOf: - "$ref": "#/definitions/HttpComponentsResolver" - - "$ref": "#/definitions/ConfigDrivenComponentsParser" interpolation: variables: - title: config diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 223cbc0b6..c75f80ac2 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -30,6 +30,8 @@ from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ( ManifestComponentTransformer, ) +from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING + from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( ManifestReferenceResolver, ) @@ -119,7 +121,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: self._emit_manifest_debug_message( extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} ) - stream_configs = self._stream_configs(self._source_config) + stream_configs = self._stream_configs(self._source_config, config) source_streams = [ self._constructor.create_component( @@ -294,13 +296,61 @@ def _parse_version( # No exception return parsed_version - def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: + def _stream_configs( + self, manifest: Mapping[str, Any], config: Mapping[str, Any] + ) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs: List[Dict[str, Any]] = manifest.get("streams", []) + + # Add dynamic stream configs to the common stream configs + stream_configs.extend(self._dynamic_stream_configs(manifest, config)) + for s in stream_configs: if "type" not in s: s["type"] = "DeclarativeStream" return stream_configs + def _dynamic_stream_configs( + self, manifest: Mapping[str, Any], config: Mapping[str, Any] + ) -> 
List[Dict[str, Any]]: + dynamic_stream_definitions: List[Dict[str, Any]] = manifest.get("dynamic_streams", []) + dynamic_stream_configs: List[Dict[str, Any]] = [] + + for dynamic_definition in dynamic_stream_definitions: + components_resolver_config = dynamic_definition["components_resolver"] + + if not components_resolver_config: + raise ValueError( + f"Missing 'components_resolver' in dynamic definition: {dynamic_definition}" + ) + + resolver_type = components_resolver_config.get("type") + if not resolver_type: + raise ValueError( + f"Missing 'type' in components resolver configuration: {components_resolver_config}" + ) + + if resolver_type not in COMPONENTS_RESOLVER_TYPE_MAPPING: + raise ValueError( + f"Invalid components resolver type '{resolver_type}'. " + f"Expected one of {list(COMPONENTS_RESOLVER_TYPE_MAPPING.keys())}." + ) + + # Create a resolver for dynamic components based on type + components_resolver = self._constructor.create_component( + COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], components_resolver_config, config + ) + + stream_template_config = dynamic_definition["stream_template"] + dynamic_stream_configs.extend( + list( + components_resolver.resolve_components( + stream_template_config=stream_template_config + ) + ) + ) + + return dynamic_stream_configs + def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None: self.logger.debug("declarative source created from manifest", extra=extra_args) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index bd8f86eab..80af4a7d3 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,7 +528,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - 
examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -752,19 +754,21 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -782,7 +786,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1065,10 +1071,6 @@ class ComponentMappingDefinition(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class ConfigDrivenComponentsParser(BaseModel): - __root__: Any - - class AddedFieldDefinition(BaseModel): type: Literal["AddedFieldDefinition"] path: List[str] = Field( @@ -1373,6 +1375,7 @@ class Config: type: Literal["DeclarativeSource"] check: CheckStream streams: List[DeclarativeStream] + dynamic_streams: Optional[List[DynamicDeclarativeStream]] = None version: str = Field( ..., description="The version of the Airbyte CDK used to build and test the source.", @@ -1442,21 +1445,25 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( + Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", + ) + ) + name: 
Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader]] = ( - Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", - ) + schema_loader: Optional[ + Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] + ] = Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1674,7 +1681,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1743,7 +1754,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1792,8 +1807,10 @@ class DynamicDeclarativeStream(BaseModel): stream_template: Optional[DeclarativeStream] = Field( None, description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Optional[Union[HttpComponentsResolver, ConfigDrivenComponentsParser]] = ( - None + components_resolver: Optional[HttpComponentsResolver] = Field( + None, + description="Component resolve and populates stream templates with components values.", + title="Components 
Resolver", ) diff --git a/airbyte_cdk/sources/declarative/resolvers/__init__.py b/airbyte_cdk/sources/declarative/resolvers/__init__.py index 8dcbaf2e2..17a3b5d52 100644 --- a/airbyte_cdk/sources/declarative/resolvers/__init__.py +++ b/airbyte_cdk/sources/declarative/resolvers/__init__.py @@ -4,5 +4,10 @@ from airbyte_cdk.sources.declarative.resolvers.components_resolver import ComponentsResolver, ComponentMappingDefinition, ResolvedComponentMappingDefinition from airbyte_cdk.sources.declarative.resolvers.http_components_resolver import HttpComponentsResolver +from airbyte_cdk.sources.declarative.models import HttpComponentsResolver as HttpComponentsResolverModel -__all__ = ["ComponentsResolver", "HttpComponentsResolver", "ComponentMappingDefinition", "ResolvedComponentMappingDefinition"] +COMPONENTS_RESOLVER_TYPE_MAPPING = { + "HttpComponentsResolver": HttpComponentsResolverModel +} + +__all__ = ["ComponentsResolver", "HttpComponentsResolver", "ComponentMappingDefinition", "ResolvedComponentMappingDefinition", "COMPONENTS_RESOLVER_TYPE_MAPPING"] diff --git a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py index 002d4767b..827a28097 100644 --- a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ -5,8 +5,9 @@ from abc import abstractmethod from dataclasses import InitVar, dataclass from typing import Any, Dict, Mapping, Optional, Type, Union, Iterable - +from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString +from deprecated.classic import deprecated @dataclass(frozen=True) @@ -31,6 +32,7 @@ class ResolvedComponentMappingDefinition: condition: Optional[Union["InterpolatedBoolean", str]] = "" +@deprecated("This class is experimental. 
Use at your own risk.", category=ExperimentalClassWarning) @dataclass class ComponentsResolver: """ diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 9395c3e2a..1b6220359 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -8,14 +8,17 @@ from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.types import Config from airbyte_cdk.sources.declarative.resolvers.components_resolver import ( ComponentsResolver, ComponentMappingDefinition, ResolvedComponentMappingDefinition, ) +from deprecated.classic import deprecated +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) @dataclass class HttpComponentsResolver(ComponentsResolver): """ From 6e3ececf493de736846750a21fbada961f1ecf0d Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Wed, 27 Nov 2024 00:43:56 +0100 Subject: [PATCH 05/25] Fix model --- .../sources/declarative/declarative_component_schema.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 7786cde4f..74901fe53 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2787,8 +2787,7 @@ definitions: components_resolver: title: Components Resolver description: Component resolve and populates stream templates with components values. 
- anyOf: - - "$ref": "#/definitions/HttpComponentsResolver" + "$ref": "#/definitions/HttpComponentsResolver" interpolation: variables: - title: config From b1365244b00c9d85e8439c47dcefcd068ec23681 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Wed, 27 Nov 2024 01:30:21 +0100 Subject: [PATCH 06/25] Add unit tests --- .../models/declarative_component_schema.py | 74 +++++++----------- .../resolvers/http_components_resolver.py | 12 +-- .../sources/declarative/resolvers/__init__.py | 3 + .../test_http_components_resolver.py | 75 +++++++++++++++++++ 4 files changed, 112 insertions(+), 52 deletions(-) create mode 100644 unit_tests/sources/declarative/resolvers/__init__.py create mode 100644 unit_tests/sources/declarative/resolvers/test_http_components_resolver.py diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 80af4a7d3..7f15410de 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,9 +528,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -754,21 +752,19 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -786,9 +782,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1445,25 +1439,21 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) - ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[ - Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] - ] = Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", + schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader]] = ( + Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", + ) ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1681,11 +1671,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -1754,11 +1740,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 1b6220359..900a09d39 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -72,8 +72,8 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: def _update_config( self, component_config: Dict[str, Any], - key: str, - value: Any, + target_key: str, + target_value: Any, condition: Optional[InterpolatedBoolean], **kwargs: Any, ) -> Dict[str, Any]: @@ -93,8 +93,8 @@ def _update_config( should_update = condition.eval(self.config, **kwargs) if condition else True for key, value in component_config.items(): - if key == key and should_update: - component_config[key] = value + if key == target_key and should_update: + component_config[key] = target_value elif isinstance(value, dict): component_config[key] = self._update_config(value, key, value, condition, **kwargs) elif isinstance(value, list): @@ -134,8 +134,8 @@ 
def resolve_components( ) updated_config = self._update_config( updated_config, - key=resolved_component.key, - value=value, + target_key=resolved_component.key, + target_value=value, condition=resolved_component.condition, # type: ignore[arg-type] # The condition in resolved_component always has the type InterpolatedBoolean if it exists. **kwargs, ) diff --git a/unit_tests/sources/declarative/resolvers/__init__.py b/unit_tests/sources/declarative/resolvers/__init__.py new file mode 100644 index 000000000..66f6de8cb --- /dev/null +++ b/unit_tests/sources/declarative/resolvers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py new file mode 100644 index 000000000..7591d36b9 --- /dev/null +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from unittest.mock import MagicMock +from airbyte_cdk.sources.declarative.resolvers import ( + ComponentMappingDefinition, + HttpComponentsResolver, +) + + +@pytest.mark.parametrize( + "components_mapping, retriever_data, stream_template_config, expected_result", + [ + ( + [ + ComponentMappingDefinition( + key="key1", + value="{{components_values['key1']}}", + value_type=str, + condition="True", + parameters={}, + ), + ComponentMappingDefinition( + key="key2", + value="{{components_values['key2']}}", + value_type=str, + condition="False", + parameters={}, + ), + ], + [{"key1": "updated_value1", "key2": "updated_value2"}], + {"key1": None, "key2": None}, + [{"key1": "updated_value1", "key2": None}], # Only key1 is updated + ), + ( + [ + ComponentMappingDefinition( + key="key3", + value="{{components_values['key3']}}", + value_type=str, + condition="True", + parameters={}, + ), + ], + [{"key3": "updated_value3"}], + {"key3": None}, + [{"key3": "updated_value3"}], # key3 is updated + ), + ], +) +def test_http_components_resolver( + components_mapping, retriever_data, stream_template_config, expected_result +): + # Mock the retriever to simulate reading records + mock_retriever = MagicMock() + mock_retriever.read_records.return_value = retriever_data + + # Use a simple dictionary for the config, as Config should be a Mapping + config = {} + + # Instantiate the resolver with mocked data + resolver = HttpComponentsResolver( + retriever=mock_retriever, + config=config, + components_mapping=components_mapping, + parameters={}, + ) + + # Run the resolve_components method and convert the result to a list + result = list(resolver.resolve_components(stream_template_config=stream_template_config)) + + # Assert the resolved components match the expected result + assert result == expected_result From 1d83663109f47754a6074a4ffcc880870f83bde7 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 01:53:13 +0100 Subject: [PATCH 07/25] Replace key 
with field_path and update according to review --- .../concurrent_declarative_source.py | 11 ++-- .../declarative_component_schema.yaml | 41 ++++++------ .../manifest_declarative_source.py | 31 +++++---- .../models/declarative_component_schema.py | 63 ++++++++++++++---- .../parsers/model_to_component_factory.py | 7 +- .../resolvers/components_resolver.py | 14 ++-- .../resolvers/http_components_resolver.py | 64 ++++--------------- .../test_http_components_resolver.py | 26 +------- 8 files changed, 119 insertions(+), 138 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 02b7865d6..0181cb34a 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -180,10 +180,13 @@ def _group_streams( state_manager = ConnectorStateManager(state=self._state) # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later - name_to_stream_mapping = { - stream["name"]: stream - for stream in self._stream_configs(self.resolved_manifest, config) - } + # Combine streams and dynamic_streams. Note: both cannot be empty at the same time, + # and this is validated during the initialization of the source. + streams = self.resolved_manifest.get("streams", []) + self.resolved_manifest.get( + "dynamic_streams", [] + ) + + name_to_stream_mapping = {stream["name"]: stream for stream in streams} for declarative_stream in self.streams(config=config): # Some low-code sources use a combination of DeclarativeStream and regular Python streams. 
We can't inspect diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 74901fe53..0c31895e2 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -7,8 +7,12 @@ version: 1.0.0 required: - type - check - - streams - version +anyOf: + - required: + - streams + - required: + - dynamic_streams properties: type: type: string @@ -2706,20 +2710,31 @@ definitions: additionalProperties: true ComponentMappingDefinition: title: Component Mapping Definition - description: Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts. (This component is experimental. Use at your own risk.) + description: (This component is experimental. Use at your own risk.) Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts. type: object required: - type - - key + - field_path - value properties: type: type: string enum: [ComponentMappingDefinition] - key: - title: Key - description: The target key in the stream template where the value will be added or updated. - type: string + field_path: + title: Field Path + description: A list of potentially nested fields indicating the full path where value will be added or updated. + type: array + items: + - type: string + interpolation_content: + - config + - components_values + - stream_template_config + examples: + - ["data"] + - ["data", "records"] + - ["data", "{{ parameters.name }}"] + - ["data", "*", "record"] value: title: Value description: The dynamic or static value to assign to the key. Interpolated values can be used to dynamically determine the value during runtime. 
@@ -2736,18 +2751,6 @@ definitions: title: Value Type description: The expected data type of the value. If omitted, the type will be inferred from the value provided. "$ref": "#/definitions/ValueType" - condition: - title: Condition - description: An optional condition that must evaluate to `true` for the mapping to be applied. This can use interpolation for dynamic evaluation. - type: string - default: "" - interpolation_context: - - config - - stream_template_config - - components_values - examples: - - "{{ components_values['created_at'] >= stream_interval['start_time'] }}" - - "{{ components_values.status in ['active', 'expired'] }}" $parameters: type: object additionalProperties: true diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index c75f80ac2..ebf149e56 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -121,7 +121,10 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: self._emit_manifest_debug_message( extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} ) - stream_configs = self._stream_configs(self._source_config, config) + + stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs( + self._source_config, config + ) source_streams = [ self._constructor.create_component( @@ -235,7 +238,8 @@ def _validate_source(self) -> None: ) streams = self._source_config.get("streams") - if not streams: + dynamic_streams = self._source_config.get("dynamic_streams") + if not (streams or dynamic_streams): raise ValidationError( f"A valid manifest should have at least one stream defined. 
Got {streams}" ) @@ -296,15 +300,9 @@ def _parse_version( # No exception return parsed_version - def _stream_configs( - self, manifest: Mapping[str, Any], config: Mapping[str, Any] - ) -> List[Dict[str, Any]]: + def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs: List[Dict[str, Any]] = manifest.get("streams", []) - - # Add dynamic stream configs to the common stream configs - stream_configs.extend(self._dynamic_stream_configs(manifest, config)) - for s in stream_configs: if "type" not in s: s["type"] = "DeclarativeStream" @@ -342,13 +340,14 @@ def _dynamic_stream_configs( ) stream_template_config = dynamic_definition["stream_template"] - dynamic_stream_configs.extend( - list( - components_resolver.resolve_components( - stream_template_config=stream_template_config - ) - ) - ) + + for dynamic_stream in components_resolver.resolve_components( + stream_template_config=stream_template_config + ): + if "type" not in dynamic_stream: + dynamic_stream["type"] = "DeclarativeStream" + + dynamic_stream_configs.append(dynamic_stream) return dynamic_stream_configs diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 7f15410de..83f8aecae 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1033,10 +1033,16 @@ class WaitUntilTimeFromHeader(BaseModel): class ComponentMappingDefinition(BaseModel): type: Literal["ComponentMappingDefinition"] - key: str = Field( + field_path: List[str] = Field( ..., - description="The target key in the stream template where the value will be added or updated.", - title="Key", + description="A list of potentially nested fields indicating the full path where value will be added or 
updated.", + examples=[ + ["data"], + ["data", "records"], + ["data", "{{ parameters.name }}"], + ["data", "*", "record"], + ], + title="Field Path", ) value: str = Field( ..., @@ -1053,15 +1059,6 @@ class ComponentMappingDefinition(BaseModel): description="The expected data type of the value. If omitted, the type will be inferred from the value provided.", title="Value Type", ) - condition: Optional[str] = Field( - "", - description="An optional condition that must evaluate to `true` for the mapping to be applied. This can use interpolation for dynamic evaluation.", - examples=[ - "{{ components_values['created_at'] >= stream_interval['start_time'] }}", - "{{ components_values.status in ['active', 'expired'] }}", - ], - title="Condition", - ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -1362,7 +1359,7 @@ class CompositeErrorHandler(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class DeclarativeSource(BaseModel): +class DeclarativeSource1(BaseModel): class Config: extra = Extra.forbid @@ -1388,6 +1385,43 @@ class Config: ) +class DeclarativeSource2(BaseModel): + class Config: + extra = Extra.forbid + + type: Literal["DeclarativeSource"] + check: CheckStream + streams: Optional[List[DeclarativeStream]] = None + dynamic_streams: List[DynamicDeclarativeStream] + version: str = Field( + ..., + description="The version of the Airbyte CDK used to build and test the source.", + ) + schemas: Optional[Schemas] = None + definitions: Optional[Dict[str, Any]] = None + spec: Optional[Spec] = None + concurrency_level: Optional[ConcurrencyLevel] = None + metadata: Optional[Dict[str, Any]] = Field( + None, + description="For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.", + ) + description: Optional[str] = Field( + None, + description="A description of the connector. 
It will be presented on the Source documentation page.", + ) + + +class DeclarativeSource(BaseModel): + class Config: + extra = Extra.forbid + + __root__: Union[DeclarativeSource1, DeclarativeSource2] = Field( + ..., + description="An API source that extracts data according to its declarative components.", + title="DeclarativeSource", + ) + + class SelectiveAuthenticator(BaseModel): class Config: extra = Extra.allow @@ -1797,7 +1831,8 @@ class DynamicDeclarativeStream(BaseModel): CompositeErrorHandler.update_forward_refs() -DeclarativeSource.update_forward_refs() +DeclarativeSource1.update_forward_refs() +DeclarativeSource2.update_forward_refs() SelectiveAuthenticator.update_forward_refs() DeclarativeStream.update_forward_refs() SessionTokenAuthenticator.update_forward_refs() diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index c81d901d2..58df95c10 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2211,11 +2211,14 @@ def create_components_mapping_definition( interpolated_value = InterpolatedString.create( model.value, parameters=model.parameters or {} ) + field_path = [ + InterpolatedString.create(path, parameters=model.parameters or {}) + for path in model.field_path + ] return ComponentMappingDefinition( - key=model.key, + field_path=field_path, value=interpolated_value, value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), - condition=model.condition or "", parameters=model.parameters or {}, ) diff --git a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py index 827a28097..af1d76345 100644 --- a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ 
-2,11 +2,11 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. # -from abc import abstractmethod +from abc import ABC, abstractmethod from dataclasses import InitVar, dataclass -from typing import Any, Dict, Mapping, Optional, Type, Union, Iterable +from typing import Any, Dict, Mapping, Optional, Type, Union, Iterable, List from airbyte_cdk.sources.source import ExperimentalClassWarning -from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString from deprecated.classic import deprecated @@ -14,27 +14,25 @@ class ComponentMappingDefinition: """Defines the key-value mapping configuration for a stream component.""" - key: str + field_path: List["InterpolatedString"] value: Union["InterpolatedString", str] value_type: Optional[Type[Any]] parameters: InitVar[Mapping[str, Any]] - condition: str = "" @dataclass(frozen=True) class ResolvedComponentMappingDefinition: """Represents a parsed and resolved component mapping for a stream configuration.""" - key: str + field_path: List["InterpolatedString"] value: "InterpolatedString" value_type: Optional[Type[Any]] parameters: InitVar[Mapping[str, Any]] - condition: Optional[Union["InterpolatedBoolean", str]] = "" @deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) @dataclass -class ComponentsResolver: +class ComponentsResolver(ABC): """ Abstract base class for resolving components in a stream template. 
""" diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 900a09d39..c3686422d 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -4,9 +4,10 @@ from copy import deepcopy from dataclasses import InitVar, dataclass, field -from typing import Any, Dict, List, Mapping, Optional, Iterable +from typing import Any, Dict, List, Mapping, Iterable -from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString +import dpath +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.types import Config @@ -47,20 +48,23 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: parameters (Mapping[str, Any]): Parameters for interpolation. 
""" for component_mapping in self.components_mapping: - condition = component_mapping.condition or "True" - if isinstance(component_mapping.value, (str, InterpolatedString)): interpolated_value = ( InterpolatedString.create(component_mapping.value, parameters=parameters) if isinstance(component_mapping.value, str) else component_mapping.value ) + + field_path = [ + InterpolatedString.create(path, parameters=parameters) + for path in component_mapping.field_path + ] + self._resolved_components.append( ResolvedComponentMappingDefinition( - key=component_mapping.key, + field_path=field_path, value=interpolated_value, value_type=component_mapping.value_type, - condition=InterpolatedBoolean(condition=condition, parameters=parameters), parameters=parameters, ) ) @@ -69,44 +73,6 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: f"Expected a string or InterpolatedString for value in mapping: {component_mapping}" ) - def _update_config( - self, - component_config: Dict[str, Any], - target_key: str, - target_value: Any, - condition: Optional[InterpolatedBoolean], - **kwargs: Any, - ) -> Dict[str, Any]: - """ - Recursively updates the configuration dictionary for a specific key. - - Args: - component_config (Dict[str, Any]): Component config to update. - key (str): Target key to update. - value (Any): Value to assign to the target key. - condition (Optional[InterpolatedBoolean]): Condition for applying the update. - - Returns: - Dict[str, Any]: Updated configuration dictionary. 
- """ - kwargs["current_component_config"] = component_config - should_update = condition.eval(self.config, **kwargs) if condition else True - - for key, value in component_config.items(): - if key == target_key and should_update: - component_config[key] = target_value - elif isinstance(value, dict): - component_config[key] = self._update_config(value, key, value, condition, **kwargs) - elif isinstance(value, list): - component_config[key] = [ - self._update_config(item, key, value, condition, **kwargs) - if isinstance(item, dict) - else item - for item in value - ] - - return component_config - def resolve_components( self, stream_template_config: Dict[str, Any] ) -> Iterable[Dict[str, Any]]: @@ -132,12 +98,8 @@ def resolve_components( value = resolved_component.value.eval( self.config, valid_types=valid_types, **kwargs ) - updated_config = self._update_config( - updated_config, - target_key=resolved_component.key, - target_value=value, - condition=resolved_component.condition, # type: ignore[arg-type] # The condition in resolved_component always has the type InterpolatedBoolean if it exists. 
- **kwargs, - ) + + path = [path.eval(self.config, **kwargs) for path in resolved_component.field_path] + dpath.set(updated_config, path, value) yield updated_config diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index 7591d36b9..13ad03442 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -16,38 +16,16 @@ ( [ ComponentMappingDefinition( - key="key1", + field_path=["key1"], value="{{components_values['key1']}}", value_type=str, - condition="True", - parameters={}, - ), - ComponentMappingDefinition( - key="key2", - value="{{components_values['key2']}}", - value_type=str, - condition="False", parameters={}, ), ], [{"key1": "updated_value1", "key2": "updated_value2"}], {"key1": None, "key2": None}, [{"key1": "updated_value1", "key2": None}], # Only key1 is updated - ), - ( - [ - ComponentMappingDefinition( - key="key3", - value="{{components_values['key3']}}", - value_type=str, - condition="True", - parameters={}, - ), - ], - [{"key3": "updated_value3"}], - {"key3": None}, - [{"key3": "updated_value3"}], # key3 is updated - ), + ) ], ) def test_http_components_resolver( From c30b43fe42e7e01d8396a4f141b7b023d2d933cb Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 02:04:20 +0100 Subject: [PATCH 08/25] Update source schema --- .../declarative_component_schema.yaml | 5 +- .../models/declarative_component_schema.py | 82 +++++++++++-------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 0c31895e2..77942a0f4 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2771,7 
+2771,7 @@ definitions: components_mapping: type: array items: - - "$ref": "#/definitions/ComponentMappingDefinition" + "$ref": "#/definitions/ComponentMappingDefinition" $parameters: type: object additionalProperties: true @@ -2791,6 +2791,9 @@ definitions: title: Components Resolver description: Component resolve and populates stream templates with components values. "$ref": "#/definitions/HttpComponentsResolver" + required: + - stream_template + - components_resolver interpolation: variables: - title: config diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 83f8aecae..6da5ac83f 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,7 +528,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -752,19 +754,21 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -782,7 +786,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1473,21 +1479,25 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( + Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", + ) + ) + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader]] = ( - Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", - ) + schema_loader: Optional[ + Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader] + ] = Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1705,7 +1715,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1774,7 +1788,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1820,11 +1838,11 @@ class HttpComponentsResolver(BaseModel): class DynamicDeclarativeStream(BaseModel): - stream_template: Optional[DeclarativeStream] = Field( - None, description="Reference to the stream template.", title="Stream Template" + stream_template: DeclarativeStream = Field( + ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Optional[HttpComponentsResolver] = Field( - None, + components_resolver: HttpComponentsResolver = Field( + ..., description="Component resolve and populates stream templates with components values.", title="Components Resolver", ) From d0d7107ac61ce52cc61becb3afde94782dcd93af Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 02:07:16 +0100 Subject: [PATCH 09/25] Fix dynamic declarative stream schema --- .../sources/declarative/declarative_component_schema.yaml | 4 ++++ .../declarative/models/declarative_component_schema.py | 1 + 2 files changed, 5 insertions(+) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml 
b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 77942a0f4..697d13961 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2783,6 +2783,9 @@ definitions: type: object description: A component that described how will be created declarative streams based on stream template. (This component is experimental. Use at your own risk.) properties: + type: + type: string + enum: [ DynamicDeclarativeStream ] stream_template: title: Stream Template description: Reference to the stream template. @@ -2792,6 +2795,7 @@ definitions: description: Component resolve and populates stream templates with components values. "$ref": "#/definitions/HttpComponentsResolver" required: + - type - stream_template - components_resolver interpolation: diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 6da5ac83f..e3adfadcf 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1838,6 +1838,7 @@ class HttpComponentsResolver(BaseModel): class DynamicDeclarativeStream(BaseModel): + type: Literal["DynamicDeclarativeStream"] stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) From f6542ec11488bb9b8860c5097d6244454f24890b Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 18:51:57 +0100 Subject: [PATCH 10/25] Added unit tests for dynamic stream read --- .../declarative_component_schema.yaml | 2 +- .../parsers/model_to_component_factory.py | 2 +- .../test_http_components_resolver.py | 135 ++++++++++++++++-- 3 files changed, 127 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 697d13961..1117625df 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2785,7 +2785,7 @@ definitions: properties: type: type: string - enum: [ DynamicDeclarativeStream ] + enum: [DynamicDeclarativeStream] stream_template: title: Stream Template description: Reference to the stream template. diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 58df95c10..fc3d3550d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2216,7 +2216,7 @@ def create_components_mapping_definition( for path in model.field_path ] return ComponentMappingDefinition( - field_path=field_path, + field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString value=interpolated_value, value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), parameters=model.parameters or {}, diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index 13ad03442..a6f824e2e 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -8,6 +8,99 @@ ComponentMappingDefinition, HttpComponentsResolver, ) +from airbyte_cdk.sources.embedded.catalog import ( + to_configured_catalog, + to_configured_stream, +) +from airbyte_cdk.models import Type +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString + +_MANIFEST = { + "version": "5.0.0", + 
"type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "dynamic_streams": [ + { + "type": "DynamicDeclarativeStream", + "stream_template": { + "type": "DeclarativeStream", + "name": "", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "ABC": {"type": "number"}, + "AED": {"type": "number"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "$parameters": {"item_id": ""}, + "url_base": "https://api.test.com", + "path": "/items/{{parameters['item_id']}}", + "http_method": "GET", + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "paginator": {"type": "NoPagination"}, + }, + }, + "components_resolver": { + "type": "HttpComponentsResolver", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "items", + "http_method": "GET", + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "paginator": {"type": "NoPagination"}, + }, + "components_mapping": [ + { + "type": "ComponentMappingDefinition", + "field_path": ["name"], + "value": "{{components_values['name']}}", + }, + { + "type": "ComponentMappingDefinition", + "field_path": [ + "retriever", + "requester", + "$parameters", + "item_id", + ], + "value": "{{components_values['id']}}", + }, + ], + }, + } + ], +} @pytest.mark.parametrize( @@ -16,29 +109,25 @@ ( [ ComponentMappingDefinition( - field_path=["key1"], + 
field_path=[InterpolatedString.create("key1", parameters={})], value="{{components_values['key1']}}", value_type=str, parameters={}, - ), + ) ], [{"key1": "updated_value1", "key2": "updated_value2"}], {"key1": None, "key2": None}, - [{"key1": "updated_value1", "key2": None}], # Only key1 is updated + [{"key1": "updated_value1", "key2": None}], ) ], ) def test_http_components_resolver( components_mapping, retriever_data, stream_template_config, expected_result ): - # Mock the retriever to simulate reading records mock_retriever = MagicMock() mock_retriever.read_records.return_value = retriever_data - - # Use a simple dictionary for the config, as Config should be a Mapping config = {} - # Instantiate the resolver with mocked data resolver = HttpComponentsResolver( retriever=mock_retriever, config=config, @@ -46,8 +135,34 @@ def test_http_components_resolver( parameters={}, ) - # Run the resolve_components method and convert the result to a list result = list(resolver.resolve_components(stream_template_config=stream_template_config)) - - # Assert the resolved components match the expected result assert result == expected_result + + +def test_dynamic_streams_read(requests_mock): + expected_stream_names = ["item_1", "item_2"] + requests_mock.get( + "https://api.test.com/items", + json=[{"id": 1, "name": "item_1"}, {"id": 2, "name": "item_2"}], + ) + requests_mock.get("https://api.test.com/items/1", json={"id": "1", "name": "item_1"}) + requests_mock.get("https://api.test.com/items/2", json={"id": "2", "name": "item_2"}) + + source = ManifestDeclarativeSource(source_config=_MANIFEST) + actual_catalog = source.discover(logger=source.logger, config={}) + + configured_streams = [ + to_configured_stream(stream, primary_key=stream.source_defined_primary_key) + for stream in actual_catalog.streams + ] + configured_catalog = to_configured_catalog(configured_streams) + records = [ + message.record + for message in source.read(MagicMock(), {}, configured_catalog) + if 
message.type == Type.RECORD + ] + + assert len(actual_catalog.streams) == 2 + assert [stream.name for stream in actual_catalog.streams] == expected_stream_names + assert len(records) == 2 + assert [record.stream for record in records] == expected_stream_names From 13d0d0f302131fb29245ed9fa64559a68327e096 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 18:57:39 +0100 Subject: [PATCH 11/25] Fix formatting --- .../manifest_declarative_source.py | 3 +-- .../parsers/model_to_component_factory.py | 22 +++++++++---------- .../resolvers/components_resolver.py | 8 ++++--- .../resolvers/http_components_resolver.py | 13 ++++++----- .../test_http_components_resolver.py | 10 +++++---- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index c9612ec9d..00db66051 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -33,14 +33,13 @@ from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ( ManifestComponentTransformer, ) -from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING - from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( ManifestReferenceResolver, ) from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.core import Stream from airbyte_cdk.sources.types import ConnectionDefinition diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 88b843055..512d65bb1 100644 --- 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -33,10 +33,6 @@ from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository -from airbyte_cdk.sources.declarative.resolvers import ( - HttpComponentsResolver, - ComponentMappingDefinition, -) from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( @@ -124,6 +120,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( CheckStream as CheckStreamModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + ComponentMappingDefinition as ComponentMappingDefinitionModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( CompositeErrorHandler as CompositeErrorHandlerModel, ) @@ -196,6 +195,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( GzipJsonDecoder as GzipJsonDecoderModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HttpComponentsResolver as HttpComponentsResolverModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpRequester as HttpRequesterModel, ) @@ -284,12 +286,6 @@ SimpleRetriever as SimpleRetrieverModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HttpComponentsResolver as HttpComponentsResolverModel, -) -from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import ( - ComponentMappingDefinition as ComponentMappingDefinitionModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( SubstreamPartitionRouter as SubstreamPartitionRouterModel, ) @@ -304,9 +300,9 @@ XmlDecoder as XmlDecoderModel, ) from airbyte_cdk.sources.declarative.partition_routers import ( - PartitionRouter, CartesianProductStreamSlicer, ListPartitionRouter, + PartitionRouter, SinglePartitionRouter, SubstreamPartitionRouter, ) @@ -347,6 +343,10 @@ ) from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.resolvers import ( + ComponentMappingDefinition, + HttpComponentsResolver, +) from airbyte_cdk.sources.declarative.retrievers import ( AsyncRetriever, SimpleRetriever, diff --git a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py index af1d76345..54ca83f54 100644 --- a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ -4,11 +4,13 @@ from abc import ABC, abstractmethod from dataclasses import InitVar, dataclass -from typing import Any, Dict, Mapping, Optional, Type, Union, Iterable, List -from airbyte_cdk.sources.source import ExperimentalClassWarning -from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from typing import Any, Dict, Iterable, List, Mapping, Optional, Type, Union + from deprecated.classic import deprecated +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.source import ExperimentalClassWarning + @dataclass(frozen=True) class ComponentMappingDefinition: diff --git a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py index 
c3686422d..d120da75b 100644 --- a/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py @@ -4,19 +4,20 @@ from copy import deepcopy from dataclasses import InitVar, dataclass, field -from typing import Any, Dict, List, Mapping, Iterable +from typing import Any, Dict, Iterable, List, Mapping import dpath +from deprecated.classic import deprecated + from airbyte_cdk.sources.declarative.interpolation import InterpolatedString -from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever -from airbyte_cdk.sources.source import ExperimentalClassWarning -from airbyte_cdk.sources.types import Config from airbyte_cdk.sources.declarative.resolvers.components_resolver import ( - ComponentsResolver, ComponentMappingDefinition, + ComponentsResolver, ResolvedComponentMappingDefinition, ) -from deprecated.classic import deprecated +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.types import Config @deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index a6f824e2e..727694152 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -2,8 +2,13 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
# -import pytest from unittest.mock import MagicMock + +import pytest + +from airbyte_cdk.models import Type +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.resolvers import ( ComponentMappingDefinition, HttpComponentsResolver, @@ -12,9 +17,6 @@ to_configured_catalog, to_configured_stream, ) -from airbyte_cdk.models import Type -from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource -from airbyte_cdk.sources.declarative.interpolation import InterpolatedString _MANIFEST = { "version": "5.0.0", From c478df513189000276590c0503f3c347a2cac735 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 19:06:35 +0100 Subject: [PATCH 12/25] Update component schema --- .../sources/declarative/declarative_component_schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 1117625df..92962d2be 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1329,7 +1329,7 @@ definitions: type: array items: - type: string - interpolation_content: + interpolation_context: - config examples: - ["data"] @@ -2726,7 +2726,7 @@ definitions: type: array items: - type: string - interpolation_content: + interpolation_context: - config - components_values - stream_template_config From 97a932a9a0024fe813ecf53e860e06ae6c9de431 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 19:26:42 +0100 Subject: [PATCH 13/25] Add caching to components resolver --- airbyte_cdk/sources/declarative/manifest_declarative_source.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 00db66051..652da85c4 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -334,6 +334,9 @@ def _dynamic_stream_configs( f"Expected one of {list(COMPONENTS_RESOLVER_TYPE_MAPPING.keys())}." ) + if "retriever" in components_resolver_config: + components_resolver_config["retriever"]["requester"]["use_cache"] = True + # Create a resolver for dynamic components based on type components_resolver = self._constructor.create_component( COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], components_resolver_config, config From ce9539c9f01fa75923a232119bf764713bda4026 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Fri, 29 Nov 2024 19:30:54 +0100 Subject: [PATCH 14/25] Fix description for fields --- .../sources/declarative/declarative_component_schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 92962d2be..b94517ec2 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2756,7 +2756,7 @@ definitions: additionalProperties: true HttpComponentsResolver: type: object - description: Component resolve and populates stream templates with components fetched via an HTTP retriever. (This component is experimental. Use at your own risk.) + description: (This component is experimental. Use at your own risk.) Component resolve and populates stream templates with components fetched via an HTTP retriever. 
properties: type: type: string @@ -2781,7 +2781,7 @@ definitions: - components_mapping DynamicDeclarativeStream: type: object - description: A component that described how will be created declarative streams based on stream template. (This component is experimental. Use at your own risk.) + description: (This component is experimental. Use at your own risk.) A component that described how will be created declarative streams based on stream template. properties: type: type: string From 0160353fec7bb6e233f6d5a4ebebc55b5334b6cd Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Fri, 29 Nov 2024 18:33:47 +0000 Subject: [PATCH 15/25] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 74 +++++++------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index e3adfadcf..cec3c7362 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,9 +528,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -754,21 +752,19 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -786,9 +782,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1479,25 +1473,21 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) - ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[ - Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] - ] = Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", + schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader]] = ( + Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", + ) ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1715,11 +1705,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -1788,11 +1774,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], From 284241d9c0ffd9cbb9df4cc71ee48d75d17eb914 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Mon, 2 Dec 2024 22:37:38 +0100 Subject: [PATCH 16/25] Update unit tests with Maxime comments --- .../concurrent_declarative_source.py | 4 +- .../models/declarative_component_schema.py | 74 ++++----- .../parsers/manifest_component_transformer.py | 6 + .../test_http_components_resolver.py | 63 +++++--- .../test_manifest_declarative_source.py | 147 +++++++++++++++++- 5 files changed, 217 insertions(+), 77 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index ef01a82bd..e156290a5 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -184,8 +184,8 @@ def _group_streams( # Combine streams and dynamic_streams. Note: both cannot be empty at the same time, # and this is validated during the initialization of the source. 
- streams = self.resolved_manifest.get("streams", []) + self.resolved_manifest.get( - "dynamic_streams", [] + streams = self._stream_configs(self._source_config) + self._dynamic_stream_configs( + self._source_config, config ) name_to_stream_mapping = {stream["name"]: stream for stream in streams} diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index e3adfadcf..cec3c7362 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,9 +528,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -754,21 +752,19 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -786,9 +782,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1479,25 +1473,21 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) - ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[ - Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] - ] = Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", + schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader]] = ( + Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", + ) ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1715,11 +1705,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -1788,11 +1774,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py index 8cacda3d7..ed05b8e52 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py @@ -31,6 +31,12 @@ # DeclarativeStream "DeclarativeStream.retriever": "SimpleRetriever", "DeclarativeStream.schema_loader": "JsonFileSchemaLoader", + # DynamicDeclarativeStream + "DynamicDeclarativeStream.stream_template": "DeclarativeStream", + "DynamicDeclarativeStream.components_resolver": "HttpComponentsResolver", + # HttpComponentsResolver + "HttpComponentsResolver.retriever": "SimpleRetriever", + "HttpComponentsResolver.components_mapping": "ComponentMappingDefinition", # DefaultErrorHandler "DefaultErrorHandler.response_filters": "HttpResponseFilter", # DefaultPaginator diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py 
b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index 727694152..08a8e8c63 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -2,13 +2,16 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. # +import json from unittest.mock import MagicMock import pytest from airbyte_cdk.models import Type +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.interpolation import InterpolatedString -from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource from airbyte_cdk.sources.declarative.resolvers import ( ComponentMappingDefinition, HttpComponentsResolver, @@ -17,6 +20,9 @@ to_configured_catalog, to_configured_stream, ) +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse + +_CONFIG = {"start_date": "2024-07-01T00:00:00.000Z"} _MANIFEST = { "version": "5.0.0", @@ -141,28 +147,41 @@ def test_http_components_resolver( assert result == expected_result -def test_dynamic_streams_read(requests_mock): +def test_dynamic_streams_read(): expected_stream_names = ["item_1", "item_2"] - requests_mock.get( - "https://api.test.com/items", - json=[{"id": 1, "name": "item_1"}, {"id": 2, "name": "item_2"}], - ) - requests_mock.get("https://api.test.com/items/1", json={"id": "1", "name": "item_1"}) - requests_mock.get("https://api.test.com/items/2", json={"id": "2", "name": "item_2"}) - - source = ManifestDeclarativeSource(source_config=_MANIFEST) - actual_catalog = source.discover(logger=source.logger, config={}) - - configured_streams = [ - to_configured_stream(stream, primary_key=stream.source_defined_primary_key) - for stream in actual_catalog.streams - ] - configured_catalog = to_configured_catalog(configured_streams) - records = [ - message.record - for message in 
source.read(MagicMock(), {}, configured_catalog) - if message.type == Type.RECORD - ] + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/items"), + HttpResponse( + body=json.dumps([{"id": 1, "name": "item_1"}, {"id": 2, "name": "item_2"}]) + ), + ) + http_mocker.get( + HttpRequest(url="https://api.test.com/items/1"), + HttpResponse(body=json.dumps({"id": "1", "name": "item_1"})), + ) + http_mocker.get( + HttpRequest(url="https://api.test.com/items/2"), + HttpResponse(body=json.dumps({"id": "2", "name": "item_2"})), + ) + + source = ConcurrentDeclarativeSource( + source_config=_MANIFEST, config=_CONFIG, catalog=None, state=None + ) + + actual_catalog = source.discover(logger=source.logger, config={}) + + configured_streams = [ + to_configured_stream(stream, primary_key=stream.source_defined_primary_key) + for stream in actual_catalog.streams + ] + configured_catalog = to_configured_catalog(configured_streams) + + records = [ + message.record + for message in source.read(MagicMock(), {}, configured_catalog) + if message.type == Type.RECORD + ] assert len(actual_catalog.streams) == 2 assert [stream.name for stream in actual_catalog.streams] == expected_stream_names diff --git a/unit_tests/sources/declarative/test_manifest_declarative_source.py b/unit_tests/sources/declarative/test_manifest_declarative_source.py index 1f4b6df56..ea92bac5e 100644 --- a/unit_tests/sources/declarative/test_manifest_declarative_source.py +++ b/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -71,6 +71,116 @@ def use_external_yaml_spec(self): yield os.remove(yaml_path) + @pytest.fixture + def _base_manifest(self): + """Base manifest without streams or dynamic streams.""" + return { + "version": "3.8.2", + "description": "This is a sample source connector that is very valid.", + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + + @pytest.fixture + def _declarative_stream(self): + def declarative_stream_config( 
+ name="lists", requester_type="HttpRequester", custom_requester=None + ): + """Generates a DeclarativeStream configuration.""" + requester_config = { + "type": requester_type, + "path": "/v3/marketing/lists", + "authenticator": { + "type": "BearerAuthenticator", + "api_token": "{{ config.apikey }}", + }, + "request_parameters": {"page_size": "{{ 10 }}"}, + } + if custom_requester: + requester_config.update(custom_requester) + + return { + "type": "DeclarativeStream", + "$parameters": { + "name": name, + "primary_key": "id", + "url_base": "https://api.sendgrid.com", + }, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": f"./source_sendgrid/schemas/{{{{ parameters.name }}}}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": { + "type": "RequestOption", + "inject_into": "request_parameter", + "field_name": "page_size", + }, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": requester_config, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + + return declarative_stream_config + + @pytest.fixture + def _dynamic_declarative_stream(self, _declarative_stream): + """Generates a DynamicDeclarativeStream configuration.""" + return { + "type": "DynamicDeclarativeStream", + "stream_template": _declarative_stream(), + "components_resolver": { + "type": "HttpComponentsResolver", + "$parameters": { + "name": "lists", + "primary_key": "id", + "url_base": "https://api.sendgrid.com", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": { + "type": "RequestOption", + "inject_into": "request_parameter", + "field_name": "page_size", + }, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ 
response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": { + "type": "BearerAuthenticator", + "api_token": "{{ config.apikey }}", + }, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + "components_mapping": [ + { + "type": "ComponentMappingDefinition", + "field_path": ["name"], + "value": "{{ components_value['name'] }}", + } + ], + }, + } + def test_valid_manifest(self): manifest = { "version": "3.8.2", @@ -516,14 +626,37 @@ def test_source_missing_checker_fails_validation(self): with pytest.raises(ValidationError): ManifestDeclarativeSource(source_config=manifest) - def test_source_with_missing_streams_fails(self): - manifest = { - "version": "0.29.3", - "definitions": None, - "check": {"type": "CheckStream", "stream_names": ["lists"]}, - } + def test_source_with_missing_streams_and_dynamic_streams_fails( + self, _base_manifest, _dynamic_declarative_stream, _declarative_stream + ): + # test case for manifest without streams or dynamic streams + manifest_without_streams_and_dynamic_streams = _base_manifest with pytest.raises(ValidationError): - ManifestDeclarativeSource(source_config=manifest) + ManifestDeclarativeSource(source_config=manifest_without_streams_and_dynamic_streams) + + # test case for manifest with streams + manifest_with_streams = { + **manifest_without_streams_and_dynamic_streams, + "streams": [ + _declarative_stream(name="lists"), + _declarative_stream( + name="stream_with_custom_requester", + requester_type="CustomRequester", + custom_requester={ + "class_name": "unit_tests.sources.declarative.external_component.SampleCustomComponent", + "custom_request_parameters": {"page_size": 10}, + }, + ), + ], + } + ManifestDeclarativeSource(source_config=manifest_with_streams) + + # test case for manifest with dynamic streams + manifest_with_dynamic_streams = { + **manifest_without_streams_and_dynamic_streams, 
+ "dynamic_streams": [_dynamic_declarative_stream], + } + ManifestDeclarativeSource(source_config=manifest_with_dynamic_streams) def test_source_with_missing_version_fails(self): manifest = { From be478ae927a8fecdd320c1d0185856a268d599da Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 04:36:48 +0100 Subject: [PATCH 17/25] Add dynamic schema loader --- airbyte_cdk/models/__init__.py | 2 - .../declarative_component_schema.yaml | 124 ++++++++-- .../models/declarative_component_schema.py | 135 ++++++++--- .../parsers/manifest_component_transformer.py | 4 + .../parsers/model_to_component_factory.py | 126 ++++++++-- .../declarative/partition_routers/__init__.py | 4 +- .../sources/declarative/schema/__init__.py | 3 +- .../schema/dynamic_schema_loader.py | 223 ++++++++++++++++++ .../schema/test_dynamic_schema_loader.py | 148 ++++++++++++ 9 files changed, 691 insertions(+), 78 deletions(-) create mode 100644 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py create mode 100644 unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py diff --git a/airbyte_cdk/models/__init__.py b/airbyte_cdk/models/__init__.py index 1105cbeda..c56df9adc 100644 --- a/airbyte_cdk/models/__init__.py +++ b/airbyte_cdk/models/__init__.py @@ -39,9 +39,7 @@ FailureType, Level, OAuthConfigSpecification, - OauthConnectorInputSpecification, OrchestratorType, - State, Status, StreamDescriptor, SyncMode, diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 83b318bf0..52d7fb2ad 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1663,6 +1663,97 @@ definitions: $parameters: type: object additionalProperties: true + TypesPair: + title: Types Pair + description: (This component is experimental. Use at your own risk.) 
Represents a mapping between a current type and its corresponding target type. + type: object + required: + - target_type + - current_type + properties: + target_type: + anyOf: + - type: string + - type: array + items: + type: string + current_type: + anyOf: + - type: string + - type: array + items: + type: string + SchemaTypeIdentifier: + title: Schema Type Identifier + description: (This component is experimental. Use at your own risk.) Identifies schema details for dynamic schema extraction and processing. + type: object + required: + - schema_pointer + - key_pointer + properties: + type: + type: string + enum: [SchemaTypeIdentifier] + schema_pointer: + title: Schema Path + description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + type: array + items: + - type: string + interpolation_content: + - config + key_pointer: + title: Key Path + description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + type: array + items: + - type: string + interpolation_content: + - config + type_pointer: + title: Type Path + description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + type: array + items: + - type: string + interpolation_content: + - config + is_nullable: + title: Is Nullable + description: Add null to defined field type. This field is automatically set by the CDK. 
+ type: boolean + default: true + types_map: + type: array + items: + - "$ref": "#/definitions/TypesPair" + $parameters: + type: object + additionalProperties: true + DynamicSchemaLoader: + title: Dynamic Schema Loader + description: (This component is experimental. Use at your own risk.) Loads a schema by extracting data from retrieved records. + type: object + required: + - type + - retriever + - schema_type_identifier + properties: + type: + type: string + enum: [DynamicSchemaLoader] + retriever: + title: Retriever + description: Component used to coordinate how records are extracted across stream slices and request pages. + anyOf: + - "$ref": "#/definitions/AsyncRetriever" + - "$ref": "#/definitions/CustomRetriever" + - "$ref": "#/definitions/SimpleRetriever" + schema_type_identifier: + "$ref": "#/definitions/SchemaTypeIdentifier" + $parameters: + type: object + additionalProperties: true InlineSchemaLoader: title: Inline Schema Loader description: Loads a schema that is defined directly in the manifest file. @@ -2044,7 +2135,7 @@ definitions: The DeclarativeOAuth Specific URL templated string to obtain the `access_token`, `refresh_token` etc. The placeholders are replaced during the processing to provide neccessary values. examples: - - access_token_url: https://auth.host.com/oauth2/token?{client_id_key}={{client_id_key}}&{client_secret_key}={{client_secret_key}}&{auth_code_key}={{auth_code_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}} + - access_token_url: https://auth.host.com/oauth2/token?{client_id_key}={{client_id_key}}&{client_secret_key}={{client_secret_key}}&{auth_code_key}={{auth_code_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}} access_token_headers: title: (Optional) DeclarativeOAuth Access Token Headers type: object @@ -2052,9 +2143,10 @@ definitions: description: |- The DeclarativeOAuth Specific optional headers to inject while exchanging the `auth_code` to `access_token` during `completeOAuthFlow` step. 
examples: - - access_token_headers: { - "Authorization": "Basic {base64Encoder:{client_id}:{client_secret}}" - } + - access_token_headers: + { + "Authorization": "Basic {base64Encoder:{client_id}:{client_secret}}", + } access_token_params: title: (Optional) DeclarativeOAuth Access Token Query Params (Json Encoded) type: object @@ -2063,18 +2155,19 @@ definitions: The DeclarativeOAuth Specific optional query parameters to inject while exchanging the `auth_code` to `access_token` during `completeOAuthFlow` step. When this property is provided, the query params will be encoded as `Json` and included in the outgoing API request. examples: - - access_token_params: { - "{auth_code_key}": "{{auth_code_key}}", - "{client_id_key}": "{{client_id_key}}", - "{client_secret_key}": "{{client_secret_key}}" - } + - access_token_params: + { + "{auth_code_key}": "{{auth_code_key}}", + "{client_id_key}": "{{client_id_key}}", + "{client_secret_key}": "{{client_secret_key}}", + } extract_output: title: DeclarativeOAuth Extract Output type: array items: type: string description: |- - The DeclarativeOAuth Specific list of strings to indicate which keys should be extracted and returned back to the input config. + The DeclarativeOAuth Specific list of strings to indicate which keys should be extracted and returned back to the input config. examples: - extract_output: ["access_token", "refresh_token", "other_field"] state: @@ -2086,17 +2179,14 @@ definitions: - max description: |- The DeclarativeOAuth Specific object to provide the criteria of how the `state` query param should be constructed, - including length and complexity. + including length and complexity. 
properties: min: type: integer max: type: integer examples: - - state: { - "min": 7, - "max": 128, - } + - state: { "min": 7, "max": 128 } client_id_key: title: (Optional) DeclarativeOAuth Client ID Key Override type: string @@ -2122,14 +2212,14 @@ definitions: title: (Optional) DeclarativeOAuth State Key Override type: string description: |- - The DeclarativeOAuth Specific optional override to provide the custom `state` key name, if required by data-provider. + The DeclarativeOAuth Specific optional override to provide the custom `state` key name, if required by data-provider. examples: - state_key: "my_custom_state_key_key_name" auth_code_key: title: (Optional) DeclarativeOAuth Auth Code Key Override type: string description: |- - The DeclarativeOAuth Specific optional override to provide the custom `code` key name to something like `auth_code` or `custom_auth_code`, if required by data-provider. + The DeclarativeOAuth Specific optional override to provide the custom `code` key name to something like `auth_code` or `custom_auth_code`, if required by data-provider. 
examples: - auth_code_key: "my_custom_auth_code_key_name" redirect_uri_key: diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 3cfba631e..d16a8c3bb 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,7 +528,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -645,6 +647,37 @@ class HttpResponseFilter(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class TypesPair(BaseModel): + target_type: Union[str, List[str]] + current_type: Union[str, List[str]] + + +class SchemaTypeIdentifier(BaseModel): + type: Optional[Literal["SchemaTypeIdentifier"]] = None + schema_pointer: List[str] = Field( + ..., + description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + title="Schema Path", + ) + key_pointer: List[str] = Field( + ..., + description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. 
See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + title="Key Path", + ) + type_pointer: Optional[List[str]] = Field( + None, + description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + title="Type Path", + ) + is_nullable: Optional[bool] = Field( + True, + description="Add null to defined field type. This field is automatically set by the CDK.", + title="Is Nullable", + ) + types_map: Optional[List[TypesPair]] = None + parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") + + class InlineSchemaLoader(BaseModel): type: Literal["InlineSchemaLoader"] schema_: Optional[Dict[str, Any]] = Field( @@ -817,13 +850,13 @@ class Config: ) extract_output: List[str] = Field( ..., - description="The DeclarativeOAuth Specific list of strings to indicate which keys should be extracted and returned back to the input config. ", + description="The DeclarativeOAuth Specific list of strings to indicate which keys should be extracted and returned back to the input config.", examples=[{"extract_output": ["access_token", "refresh_token", "other_field"]}], title="DeclarativeOAuth Extract Output", ) state: Optional[State] = Field( None, - description="The DeclarativeOAuth Specific object to provide the criteria of how the `state` query param should be constructed,\nincluding length and complexity. 
", + description="The DeclarativeOAuth Specific object to provide the criteria of how the `state` query param should be constructed,\nincluding length and complexity.", examples=[{"state": {"min": 7, "max": 128}}], title="(Optional) DeclarativeOAuth Configurable State Query Param", ) @@ -847,13 +880,13 @@ class Config: ) state_key: Optional[str] = Field( None, - description="The DeclarativeOAuth Specific optional override to provide the custom `state` key name, if required by data-provider. ", + description="The DeclarativeOAuth Specific optional override to provide the custom `state` key name, if required by data-provider.", examples=[{"state_key": "my_custom_state_key_key_name"}], title="(Optional) DeclarativeOAuth State Key Override", ) auth_code_key: Optional[str] = Field( None, - description="The DeclarativeOAuth Specific optional override to provide the custom `code` key name to something like `auth_code` or `custom_auth_code`, if required by data-provider. ", + description="The DeclarativeOAuth Specific optional override to provide the custom `code` key name to something like `auth_code` or `custom_auth_code`, if required by data-provider.", examples=[{"auth_code_key": "my_custom_auth_code_key_name"}], title="(Optional) DeclarativeOAuth Auth Code Key Override", ) @@ -869,24 +902,28 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from `base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + 
urlDecorer - decode the input url-encoded string into text format, {urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( + Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from `base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + urlDecorer - decode the input url-encoded string into text format, 
{urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -904,7 +941,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1526,21 +1565,25 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( + Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", + ) + ) + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader]] = ( - Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", - ) + schema_loader: Optional[ + Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader] + ] = Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1700,6 +1743,17 @@ class HttpRequester(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class DynamicSchemaLoader(BaseModel): + type: Literal["DynamicSchemaLoader"] + retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field( + ..., + description="Component used to coordinate how records are extracted across stream slices and request pages.", + title="Retriever", + ) + schema_type_identifier: SchemaTypeIdentifier + parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") + + class ParentStreamConfig(BaseModel): type: Literal["ParentStreamConfig"] parent_key: str = Field( @@ -1758,7 +1812,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1827,7 +1885,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -1866,5 +1928,6 @@ class SubstreamPartitionRouter(BaseModel): SelectiveAuthenticator.update_forward_refs() DeclarativeStream.update_forward_refs() SessionTokenAuthenticator.update_forward_refs() +DynamicSchemaLoader.update_forward_refs() SimpleRetriever.update_forward_refs() AsyncRetriever.update_forward_refs() diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py 
b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py index 8cacda3d7..8cea2a825 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py @@ -58,6 +58,10 @@ "AddFields.fields": "AddedFieldDefinition", # CustomPartitionRouter "CustomPartitionRouter.parent_stream_configs": "ParentStreamConfig", + # DynamicSchemaLoader + "DynamicSchemaLoader.retriever": "SimpleRetriever", + # SchemaTypeIdentifier + "SchemaTypeIdentifier.types_map": "TypesPair", } # We retain a separate registry for custom components to automatically insert the type if it is missing. This is intended to diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index e8f7a9b74..fb3d0f0af 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -186,6 +186,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( DpathExtractor as DpathExtractorModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + DynamicSchemaLoader as DynamicSchemaLoaderModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, ) @@ -270,6 +273,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( RequestPath as RequestPathModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + SchemaTypeIdentifier as SchemaTypeIdentifierModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( SelectiveAuthenticator as SelectiveAuthenticatorModel, ) @@ -283,6 +289,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 
SubstreamPartitionRouter as SubstreamPartitionRouterModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + TypesPair as TypesPairModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( WaitTimeFromHeader as WaitTimeFromHeaderModel, @@ -296,6 +305,7 @@ from airbyte_cdk.sources.declarative.partition_routers import ( CartesianProductStreamSlicer, ListPartitionRouter, + PartitionRouter, SinglePartitionRouter, SubstreamPartitionRouter, ) @@ -343,8 +353,11 @@ ) from airbyte_cdk.sources.declarative.schema import ( DefaultSchemaLoader, + DynamicSchemaLoader, InlineSchemaLoader, JsonFileSchemaLoader, + SchemaTypeIdentifier, + TypesPair, ) from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer @@ -439,6 +452,9 @@ def _init_mappings(self) -> None: IterableDecoderModel: self.create_iterable_decoder, XmlDecoderModel: self.create_xml_decoder, JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, + DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, + SchemaTypeIdentifierModel: self.create_schema_type_identifier, + TypesPairModel: self.create_types_pair, JwtAuthenticatorModel: self.create_jwt_authenticator, LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, ListPartitionRouterModel: self.create_list_partition_router, @@ -1280,19 +1296,20 @@ def create_declarative_stream( parameters=model.parameters or {}, ) - def _merge_stream_slicers( - self, model: DeclarativeStreamModel, config: Config - ) -> Optional[StreamSlicer]: - stream_slicer = None + def _build_stream_slicer_from_partition_router( + self, + model: Union[AsyncRetrieverModel, CustomRetrieverModel, SimpleRetrieverModel], + config: Config, + ) -> Optional[PartitionRouter]: if ( - hasattr(model.retriever, "partition_router") - and 
isinstance(model.retriever, SimpleRetrieverModel) - and model.retriever.partition_router + hasattr(model, "partition_router") + and isinstance(model, SimpleRetrieverModel) + and model.partition_router ): - stream_slicer_model = model.retriever.partition_router + stream_slicer_model = model.partition_router if isinstance(stream_slicer_model, list): - stream_slicer = CartesianProductStreamSlicer( + return CartesianProductStreamSlicer( [ self._create_component_from_model(model=slicer, config=config) for slicer in stream_slicer_model @@ -1300,9 +1317,24 @@ def _merge_stream_slicers( parameters={}, ) else: - stream_slicer = self._create_component_from_model( - model=stream_slicer_model, config=config - ) + return self._create_component_from_model(model=stream_slicer_model, config=config) # type: ignore[no-any-return] + # Will be created PartitionRouter as stream_slicer_model is model.partition_router + return None + + def _build_resumable_cursor_from_paginator( + self, + model: Union[AsyncRetrieverModel, CustomRetrieverModel, SimpleRetrieverModel], + stream_slicer: Optional[StreamSlicer], + ) -> Optional[StreamSlicer]: + if hasattr(model, "paginator") and model.paginator and not stream_slicer: + # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` + return ResumableFullRefreshCursor(parameters={}) + return None + + def _merge_stream_slicers( + self, model: DeclarativeStreamModel, config: Config + ) -> Optional[StreamSlicer]: + stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) if model.incremental_sync and stream_slicer: incremental_sync_model = model.incremental_sync @@ -1343,15 +1375,7 @@ def _merge_stream_slicers( ), partition_router=stream_slicer, ) - elif ( - hasattr(model.retriever, "paginator") - and model.retriever.paginator - and not stream_slicer - ): - # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` - return 
ResumableFullRefreshCursor(parameters={}) - else: - return None + return self._build_resumable_cursor_from_paginator(model.retriever, stream_slicer) def create_default_error_handler( self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any @@ -1543,6 +1567,66 @@ def create_inline_schema_loader( ) -> InlineSchemaLoader: return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) + @staticmethod + def create_types_pair(model: TypesPairModel, **kwargs: Any) -> TypesPair: + return TypesPair(target_type=model.target_type, current_type=model.current_type) + + def create_schema_type_identifier( + self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any + ) -> SchemaTypeIdentifier: + types_map = [] + if model.types_map: + types_map.extend( + [ + self._create_component_from_model(types_pair, config=config) + for types_pair in model.types_map + ] + ) + model_schema_pointer: List[Union[InterpolatedString, str]] = [ + x for x in model.schema_pointer + ] + model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] + model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( + [x for x in model.type_pointer] if model.type_pointer else None + ) + + assert model.is_nullable is not None # for mypy + + return SchemaTypeIdentifier( + schema_pointer=model_schema_pointer, + key_pointer=model_key_pointer, + type_pointer=model_type_pointer, + types_map=types_map, + is_nullable=model.is_nullable, + parameters=model.parameters or {}, + ) + + def create_dynamic_schema_loader( + self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any + ) -> DynamicSchemaLoader: + stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) + combined_slicers = self._build_resumable_cursor_from_paginator( + model.retriever, stream_slicer + ) + + retriever = self._create_component_from_model( + model=model.retriever, + config=config, + name="", + primary_key=None, + 
stream_slicer=combined_slicers, + transformations=[], + ) + schema_type_identifier = self._create_component_from_model( + model.schema_type_identifier, config=config, parameters=model.parameters or {} + ) + return DynamicSchemaLoader( + retriever=retriever, + config=config, + schema_type_identifier=schema_type_identifier, + parameters=model.parameters or {}, + ) + @staticmethod def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder: return JsonDecoder(parameters={}) diff --git a/airbyte_cdk/sources/declarative/partition_routers/__init__.py b/airbyte_cdk/sources/declarative/partition_routers/__init__.py index 86e472a42..811bb5c57 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/__init__.py +++ b/airbyte_cdk/sources/declarative/partition_routers/__init__.py @@ -6,5 +6,7 @@ from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter + +__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter", "PartitionRouter"] -__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter"] diff --git a/airbyte_cdk/sources/declarative/schema/__init__.py b/airbyte_cdk/sources/declarative/schema/__init__.py index fee72f44f..f6b7cd918 100644 --- a/airbyte_cdk/sources/declarative/schema/__init__.py +++ b/airbyte_cdk/sources/declarative/schema/__init__.py @@ -6,5 +6,6 @@ from airbyte_cdk.sources.declarative.schema.inline_schema_loader import InlineSchemaLoader from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader from 
airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.declarative.schema.dynamic_schema_loader import DynamicSchemaLoader, TypesPair, SchemaTypeIdentifier -__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader"] +__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader", "DynamicSchemaLoader", "TypesPair", "SchemaTypeIdentifier"] diff --git a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py new file mode 100644 index 000000000..733d9a5f7 --- /dev/null +++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py @@ -0,0 +1,223 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + + +from copy import deepcopy +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, MutableMapping, Optional, Union + +import dpath +from deprecated.classic import deprecated + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.types import Config + +AIRBYTE_DATA_TYPES = { + "string": {"type": "string"}, + "boolean": {"type": "boolean"}, + "date": {"type": "string", "format": "date"}, + "timestamp_without_timezone": { + "type": "string", + "format": "date-time", + "airbyte_type": "timestamp_without_timezone", + }, + "timestamp_with_timezone": {"type": "string", "format": "date-time"}, + "time_without_timezone": { + "type": "string", + "format": "time", + "airbyte_type": "time_without_timezone", + }, + "time_with_timezone": { + "type": "string", + "format": "time", + "airbyte_type": "time_with_timezone", + }, + "integer": {"type": "integer"}, + 
"number": {"type": "number"}, + "array": {"type": "array"}, + "object": {"type": "object"}, +} + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +@dataclass(frozen=True) +class TypesPair: + """ + Represents a mapping between a current type and its corresponding target type. + """ + + target_type: Union[List[str], str] + current_type: Union[List[str], str] + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +@dataclass +class SchemaTypeIdentifier: + """ + Identifies schema details for dynamic schema extraction and processing. + """ + + schema_pointer: List[Union[InterpolatedString, str]] + key_pointer: List[Union[InterpolatedString, str]] + parameters: InitVar[Mapping[str, Any]] + type_pointer: Optional[List[Union[InterpolatedString, str]]] = None + types_map: List[TypesPair] = None + is_nullable: bool = True + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.schema_pointer = self._update_pointer(self.schema_pointer, parameters) + self.key_pointer = self._update_pointer(self.key_pointer, parameters) + self.type_pointer = ( + self._update_pointer(self.type_pointer, parameters) if self.type_pointer else None + ) + + @staticmethod + def _update_pointer( + pointer: Optional[List[Union[InterpolatedString, str]]], parameters: Mapping[str, Any] + ) -> Optional[List[Union[InterpolatedString, str]]]: + return ( + [ + InterpolatedString.create(path, parameters=parameters) + if isinstance(path, str) + else path + for path in pointer + ] + if pointer + else None + ) + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +@dataclass +class DynamicSchemaLoader(SchemaLoader): + """ + Dynamically loads a JSON Schema by extracting data from retrieved records. 
+ """ + + retriever: Retriever + config: Config + parameters: InitVar[Mapping[str, Any]] + schema_type_identifier: SchemaTypeIdentifier + + def get_json_schema(self) -> Mapping[str, Any]: + """ + Constructs a JSON Schema based on retrieved data. + """ + properties = {} + for retrieved_record in self.retriever.read_records({}): + raw_schema = self._extract_data( + retrieved_record, self.schema_type_identifier.schema_pointer + ) + for property_definition in raw_schema: + key = self._get_key(property_definition, self.schema_type_identifier.key_pointer) + value = self._get_type( + property_definition, + self.schema_type_identifier.type_pointer, + is_nullable=self.schema_type_identifier.is_nullable, + ) + properties[key] = value + + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": properties, + } + + def _get_key( + self, + raw_schema: MutableMapping[str, Any], + field_key_path: List[Union[InterpolatedString, str]], + ) -> str: + """ + Extracts the key field from the schema using the specified path. + """ + field_key = self._extract_data(raw_schema, field_key_path) + if not isinstance(field_key, str): + raise ValueError(f"Expected key to be a string. Got {field_key}") + return field_key + + def _get_type( + self, + raw_schema: MutableMapping[str, Any], + field_type_path: Optional[List[Union[InterpolatedString, str]]], + is_nullable: bool = True, + ) -> Union[Mapping[str, Any], List[Mapping[str, Any]]]: + """ + Determines the JSON Schema type for a field, supporting nullable and combined types. 
+ """ + raw_field_type = ( + self._extract_data(raw_schema, field_type_path, default="string") + if field_type_path + else "string" + ) + mapped_field_type = self._replace_type_if_not_valid(raw_field_type) + if ( + isinstance(mapped_field_type, list) + and len(mapped_field_type) == 2 + and all(isinstance(item, str) for item in mapped_field_type) + ): + first_type = self._make_field_nullable( + self._get_airbyte_type(mapped_field_type[0]), is_nullable + ) + second_type = self._make_field_nullable( + self._get_airbyte_type(mapped_field_type[1]), is_nullable + ) + return {"oneOf": [first_type, second_type]} + return self._make_field_nullable(self._get_airbyte_type(mapped_field_type), is_nullable) + + def _replace_type_if_not_valid(self, field_type: str) -> str: + """ + Replaces a field type if it matches a type mapping in `types_map`. + """ + for types_pair in self.schema_type_identifier.types_map: + if field_type == types_pair.current_type: + return types_pair.target_type + return field_type + + @staticmethod + def _make_field_nullable( + field_type: Mapping[str, Any], is_nullable: bool = True + ) -> Mapping[str, Any]: + """ + Wraps a field type to allow null values if `is_nullable` is True. + """ + + if is_nullable: + field_type["type"] = ["null", field_type["type"]] + return field_type + + @staticmethod + def _get_airbyte_type(field_type: str) -> Mapping[str, Any]: + """ + Maps a field type to its corresponding Airbyte type definition. + """ + if field_type not in AIRBYTE_DATA_TYPES: + raise ValueError(f"Invalid Airbyte data type: {field_type}") + + return deepcopy(AIRBYTE_DATA_TYPES[field_type]) + + def _extract_data( + self, + body: Mapping[str, Any], + extraction_path: List[Union[InterpolatedString, str]], + default: Any = None, + ) -> Any: + """ + Extracts data from the body based on the provided extraction path. 
+ """ + + if len(extraction_path) == 0: + return body + + path = [path.eval(self.config) for path in extraction_path] + + if "*" in path: + extracted = dpath.values(body, path) + else: + extracted = dpath.get(body, path, default=default) + + return extracted diff --git a/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py new file mode 100644 index 000000000..04fcd0182 --- /dev/null +++ b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py @@ -0,0 +1,148 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest + +from airbyte_cdk.sources.declarative.schema import DynamicSchemaLoader, SchemaTypeIdentifier + + +@pytest.fixture +def mock_retriever(): + retriever = MagicMock() + retriever.read_records.return_value = [ + { + "schema": [ + {"field1": {"key": "name", "type": "string"}}, + {"field2": {"key": "age", "type": "integer"}}, + {"field3": {"key": "active", "type": "boolean"}}, + ] + } + ] + return retriever + + +@pytest.fixture +def mock_schema_type_identifier(): + return SchemaTypeIdentifier( + schema_pointer=["schema"], + key_pointer=["key"], + type_pointer=["type"], + is_nullable=True, + types_map=[], + parameters={}, + ) + + +@pytest.fixture +def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): + config = MagicMock() + parameters = {} + return DynamicSchemaLoader( + retriever=mock_retriever, + config=config, + parameters=parameters, + schema_type_identifier=mock_schema_type_identifier, + ) + + +@pytest.mark.parametrize( + "retriever_data, expected_schema", + [ + ( + # Test case: All fields with valid types + [ + { + "schema": [ + {"key": "name", "type": "string"}, + {"key": "age", "type": "integer"}, + ] + } + ], + { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "name": {"type": ["null", "string"]}, + "age": {"type": ["null", 
"integer"]}, + }, + }, + ), + ( + # Test case: Fields with missing type default to "string" + [ + { + "schema": [ + {"key": "name"}, + {"key": "email", "type": "string"}, + ] + } + ], + { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "name": {"type": ["null", "string"]}, + "email": {"type": ["null", "string"]}, + }, + }, + ), + ( + # Test case: Fields with nested types + [ + { + "schema": [ + {"key": "address", "type": ["string", "integer"]}, + ] + } + ], + { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "address": { + "oneOf": [{"type": ["null", "string"]}, {"type": ["null", "integer"]}] + }, + }, + }, + ), + ( + # Test case: Empty record set + [], + { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": {}, + }, + ), + ], +) +def test_dynamic_schema_loader(dynamic_schema_loader, retriever_data, expected_schema): + dynamic_schema_loader.retriever.read_records.return_value = retriever_data + + schema = dynamic_schema_loader.get_json_schema() + + # Validate the generated schema + assert schema == expected_schema + + +def test_dynamic_schema_loader_invalid_key(dynamic_schema_loader): + # Test case: Invalid key type + dynamic_schema_loader.retriever.read_records.return_value = [ + {"schema": [{"field1": {"key": 123, "type": "string"}}]} + ] + + with pytest.raises(ValueError, match="Expected key to be a string"): + dynamic_schema_loader.get_json_schema() + + +def test_dynamic_schema_loader_invalid_type(dynamic_schema_loader): + # Test case: Invalid type + dynamic_schema_loader.retriever.read_records.return_value = [ + {"schema": [{"field1": {"key": "name", "type": "invalid_type"}}]} + ] + + with pytest.raises(ValueError, match="Expected key to be a string. 
Got None"): + dynamic_schema_loader.get_json_schema() From 520998aea629a0c5d0f9857b7fd1af65aba6e24b Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 04:49:42 +0100 Subject: [PATCH 18/25] Revert imports --- airbyte_cdk/models/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte_cdk/models/__init__.py b/airbyte_cdk/models/__init__.py index c56df9adc..1105cbeda 100644 --- a/airbyte_cdk/models/__init__.py +++ b/airbyte_cdk/models/__init__.py @@ -39,7 +39,9 @@ FailureType, Level, OAuthConfigSpecification, + OauthConnectorInputSpecification, OrchestratorType, + State, Status, StreamDescriptor, SyncMode, From 738713157a993d7bb5274123ec61ca105bd84065 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Tue, 3 Dec 2024 03:52:14 +0000 Subject: [PATCH 19/25] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 84 +++++++------------ 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index d16a8c3bb..5ad01dd9e 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,9 +528,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -902,28 +900,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from `base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + 
urlDecorer - decode the input url-encoded string into text format, {urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from `base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + urlDecorer - decode the input url-encoded string into text format, 
{urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -941,9 +935,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1565,25 +1557,21 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) - ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[ - Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] - ] = Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", + schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader]] = ( + Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", + ) ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1812,11 +1800,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -1885,11 +1869,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], From c6dcbc887b4bff42be161ccceb8f4f969b7ffa8c Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 04:54:18 +0100 Subject: [PATCH 20/25] Updated version for manifest in unit tests --- .../declarative/resolvers/test_http_components_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py index 08a8e8c63..9e9fe225a 100644 --- a/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py +++ b/unit_tests/sources/declarative/resolvers/test_http_components_resolver.py @@ -25,7 +25,7 @@ _CONFIG = {"start_date": "2024-07-01T00:00:00.000Z"} _MANIFEST = { - "version": "5.0.0", + "version": "6.7.0", "type": "DeclarativeSource", "check": {"type": "CheckStream", "stream_names": ["Rates"]}, "dynamic_streams": [ From 890eec1da246266ec2861cbc1bbedfd0fb3dd04d Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 05:05:06 +0100 Subject: [PATCH 21/25] Added details to 
ComponentMappingDefinition doc string --- .../sources/declarative/resolvers/components_resolver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py index 54ca83f54..d4eb01788 100644 --- a/airbyte_cdk/sources/declarative/resolvers/components_resolver.py +++ b/airbyte_cdk/sources/declarative/resolvers/components_resolver.py @@ -14,7 +14,9 @@ @dataclass(frozen=True) class ComponentMappingDefinition: - """Defines the key-value mapping configuration for a stream component.""" + """Defines the configuration for mapping a component in a stream. This class specifies + what field in the stream template should be updated with value, supporting dynamic interpolation + and type enforcement.""" field_path: List["InterpolatedString"] value: Union["InterpolatedString", str] @@ -24,7 +26,9 @@ class ComponentMappingDefinition: @dataclass(frozen=True) class ResolvedComponentMappingDefinition: - """Represents a parsed and resolved component mapping for a stream configuration.""" + """Defines resolved configuration for mapping a component in a stream. 
This class specifies + what field in the stream template should be updated with value, supporting dynamic interpolation + and type enforcement.""" field_path: List["InterpolatedString"] value: "InterpolatedString" From 807d23e2df35982269a1e3264b8edec260dedaa3 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 05:16:35 +0100 Subject: [PATCH 22/25] Fix edge case validation --- .../sources/declarative/schema/dynamic_schema_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py index 733d9a5f7..f2c906bf3 100644 --- a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py @@ -173,9 +173,10 @@ def _replace_type_if_not_valid(self, field_type: str) -> str: """ Replaces a field type if it matches a type mapping in `types_map`. """ - for types_pair in self.schema_type_identifier.types_map: - if field_type == types_pair.current_type: - return types_pair.target_type + if self.schema_type_identifier.types_map: + for types_pair in self.schema_type_identifier.types_map: + if field_type == types_pair.current_type: + return types_pair.target_type return field_type @staticmethod @@ -187,6 +188,7 @@ def _make_field_nullable( """ if is_nullable: + field_type = deepcopy(field_type) field_type["type"] = ["null", field_type["type"]] return field_type From 59c5c7fd1740ea375fc70252f59af48c75756a17 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 05:44:22 +0100 Subject: [PATCH 23/25] Fix mypy --- .../models/declarative_component_schema.py | 84 +++++++------------ .../schema/dynamic_schema_loader.py | 36 +++++--- 2 files changed, 55 insertions(+), 65 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py 
index d16a8c3bb..5ad01dd9e 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -528,9 +528,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -902,28 +900,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": 
"string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, 
{base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from `base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + urlDecorer - decode the input url-encoded string into text format, {urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{my_var}`.\n- The nested resolution variables like `{{my_nested_var}}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {base64Encoder:{my_var_a}:{my_var_b}}\n + base64Decorer - decode from 
`base64` encoded string, {base64Decoder:{my_string_variable_or_string_value}}\n + urlEncoder - encode the input string to URL-like format, {urlEncoder:https://test.host.com/endpoint}\n + urlDecorer - decode the input url-encoded string into text format, {urlDecoder:https%3A%2F%2Fairbyte.io}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {codeChallengeS256:{state_value}}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{client_id_key}={{client_id_key}}&{redirect_uri_key}={urlEncoder:{{redirect_uri_key}}}&{state_key}={{state_key}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{auth_code_key}": "{{auth_code_key}}",\n "{client_id_key}": "{{client_id_key}}",\n "{client_secret_key}": "{{client_secret_key}}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -941,9 +935,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1565,25 +1557,21 @@ class Config: description="Component used to coordinate how records are extracted across stream slices and request pages.", title="Retriever", ) - incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( - Field( - None, - description="Component used to fetch data incrementally based on a time field in the data.", - title="Incremental Sync", - ) - ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = Field( + None, + description="Component used to fetch data incrementally based on a time field in the data.", + title="Incremental Sync", ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) - schema_loader: Optional[ - Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] - ] = Field( - None, - description="Component used to retrieve the schema for the current stream.", - title="Schema Loader", + schema_loader: Optional[Union[InlineSchemaLoader, JsonFileSchemaLoader, 
CustomSchemaLoader]] = ( + Field( + None, + description="Component used to retrieve the schema for the current stream.", + title="Schema Loader", + ) ) transformations: Optional[ List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] @@ -1812,11 +1800,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -1885,11 +1869,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], diff --git a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py index f2c906bf3..a4b6239e1 100644 --- a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py @@ -65,12 +65,12 @@ class SchemaTypeIdentifier: key_pointer: List[Union[InterpolatedString, str]] parameters: InitVar[Mapping[str, Any]] type_pointer: Optional[List[Union[InterpolatedString, str]]] = None - types_map: List[TypesPair] = None + types_map: Optional[List[TypesPair]] = None is_nullable: bool = True def __post_init__(self, parameters: Mapping[str, Any]) -> None: - self.schema_pointer = self._update_pointer(self.schema_pointer, parameters) - self.key_pointer = self._update_pointer(self.key_pointer, parameters) + self.schema_pointer = self._update_pointer(self.schema_pointer, parameters) # type: ignore[assignment] # This is reqired field in model + self.key_pointer = self._update_pointer(self.key_pointer, parameters) # type: ignore[assignment] # This is reqired 
field in model self.type_pointer = ( self._update_pointer(self.type_pointer, parameters) if self.type_pointer else None ) @@ -110,7 +110,8 @@ def get_json_schema(self) -> Mapping[str, Any]: properties = {} for retrieved_record in self.retriever.read_records({}): raw_schema = self._extract_data( - retrieved_record, self.schema_type_identifier.schema_pointer + retrieved_record, # type: ignore[arg-type] # Expected that retrieved_record will be only Mapping[str, Any] + self.schema_type_identifier.schema_pointer, ) for property_definition in raw_schema: key = self._get_key(property_definition, self.schema_type_identifier.key_pointer) @@ -167,9 +168,16 @@ def _get_type( self._get_airbyte_type(mapped_field_type[1]), is_nullable ) return {"oneOf": [first_type, second_type]} - return self._make_field_nullable(self._get_airbyte_type(mapped_field_type), is_nullable) + elif isinstance(mapped_field_type, str): + return self._make_field_nullable(self._get_airbyte_type(mapped_field_type), is_nullable) + else: + raise ValueError( + f"Invalid data type. Available string or two items list of string. Got {mapped_field_type}." + ) - def _replace_type_if_not_valid(self, field_type: str) -> str: + def _replace_type_if_not_valid( + self, field_type: Union[List[str], str] + ) -> Union[List[str], str]: """ Replaces a field type if it matches a type mapping in `types_map`. """ @@ -186,11 +194,10 @@ def _make_field_nullable( """ Wraps a field type to allow null values if `is_nullable` is True. 
""" - + updated_field_type = dict(deepcopy(field_type)) if is_nullable: - field_type = deepcopy(field_type) - field_type["type"] = ["null", field_type["type"]] - return field_type + updated_field_type["type"] = ["null", updated_field_type["type"]] + return updated_field_type @staticmethod def _get_airbyte_type(field_type: str) -> Mapping[str, Any]: @@ -215,11 +222,14 @@ def _extract_data( if len(extraction_path) == 0: return body - path = [path.eval(self.config) for path in extraction_path] + path = [ + path.eval(self.config) if not isinstance(path, str) else path + for path in extraction_path + ] if "*" in path: - extracted = dpath.values(body, path) + extracted = dpath.values(body, path) # type: ignore # extracted will be a MutableMapping, given input data structure else: - extracted = dpath.get(body, path, default=default) + extracted = dpath.get(body, path, default=default) # type: ignore # extracted will be a MutableMapping, given input data structure return extracted From 227325f670cc37f1a981ae953cffc05ba2859fa9 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 18:10:30 +0100 Subject: [PATCH 24/25] Update after review --- .../declarative_component_schema.yaml | 20 ++--- .../models/declarative_component_schema.py | 15 ++-- .../parsers/manifest_component_transformer.py | 2 +- .../parsers/model_to_component_factory.py | 17 ++-- .../sources/declarative/schema/__init__.py | 4 +- .../schema/dynamic_schema_loader.py | 84 ++++++++----------- .../schema/test_dynamic_schema_loader.py | 69 ++++++++------- 7 files changed, 93 insertions(+), 118 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 52d7fb2ad..17efa41f0 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1663,8 +1663,8 @@ definitions: $parameters: type: object 
additionalProperties: true - TypesPair: - title: Types Pair + TypesMap: + title: Types Map description: (This component is experimental. Use at your own risk.) Represents a mapping between a current type and its corresponding target type. type: object required: @@ -1696,15 +1696,16 @@ definitions: enum: [SchemaTypeIdentifier] schema_pointer: title: Schema Path - description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + description: List of nested fields defining the schema field path to extract. Defaults to []. type: array + default: [] items: - type: string interpolation_content: - config key_pointer: title: Key Path - description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + description: List of potentially nested fields describing the full path of the field key to extract. type: array items: - type: string @@ -1712,21 +1713,16 @@ definitions: - config type_pointer: title: Type Path - description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector). + description: List of potentially nested fields describing the full path of the field type to extract. type: array items: - type: string interpolation_content: - config - is_nullable: - title: Is Nullable - description: Add null to defined field type. This field is automatically set by the CDK. 
- type: boolean - default: true - types_map: + types_mapping: type: array items: - - "$ref": "#/definitions/TypesPair" + - "$ref": "#/definitions/TypesMap" $parameters: type: object additionalProperties: true diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 5ad01dd9e..c1053c577 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -645,7 +645,7 @@ class HttpResponseFilter(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class TypesPair(BaseModel): +class TypesMap(BaseModel): target_type: Union[str, List[str]] current_type: Union[str, List[str]] @@ -654,25 +654,20 @@ class SchemaTypeIdentifier(BaseModel): type: Optional[Literal["SchemaTypeIdentifier"]] = None schema_pointer: List[str] = Field( ..., - description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + description="List of nested fields defining the schema field path to extract. Defaults to [].", title="Schema Path", ) key_pointer: List[str] = Field( ..., - description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + description="List of potentially nested fields describing the full path of the field key to extract.", title="Key Path", ) type_pointer: Optional[List[str]] = Field( None, - description='List of potentially nested fields describing the full path of the field to extract. 
Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + description="List of potentially nested fields describing the full path of the field type to extract.", title="Type Path", ) - is_nullable: Optional[bool] = Field( - True, - description="Add null to defined field type. This field is automatically set by the CDK.", - title="Is Nullable", - ) - types_map: Optional[List[TypesPair]] = None + types_mapping: Optional[List[TypesMap]] = None parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py index 8cea2a825..8b1798229 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py @@ -61,7 +61,7 @@ # DynamicSchemaLoader "DynamicSchemaLoader.retriever": "SimpleRetriever", # SchemaTypeIdentifier - "SchemaTypeIdentifier.types_map": "TypesPair", + "SchemaTypeIdentifier.types_map": "TypesMap", } # We retain a separate registry for custom components to automatically insert the type if it is missing. 
This is intended to diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index fb3d0f0af..1ac449407 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -290,7 +290,7 @@ SubstreamPartitionRouter as SubstreamPartitionRouterModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - TypesPair as TypesPairModel, + TypesMap as TypesMapModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( @@ -357,7 +357,7 @@ InlineSchemaLoader, JsonFileSchemaLoader, SchemaTypeIdentifier, - TypesPair, + TypesMap, ) from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer @@ -454,7 +454,7 @@ def _init_mappings(self) -> None: JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, SchemaTypeIdentifierModel: self.create_schema_type_identifier, - TypesPairModel: self.create_types_pair, + TypesMapModel: self.create_types_map, JwtAuthenticatorModel: self.create_jwt_authenticator, LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, ListPartitionRouterModel: self.create_list_partition_router, @@ -1568,18 +1568,18 @@ def create_inline_schema_loader( return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) @staticmethod - def create_types_pair(model: TypesPairModel, **kwargs: Any) -> TypesPair: - return TypesPair(target_type=model.target_type, current_type=model.current_type) + def create_types_map(model: TypesMapModel, **kwargs: Any) -> TypesMap: + return TypesMap(target_type=model.target_type, current_type=model.current_type) def 
create_schema_type_identifier( self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any ) -> SchemaTypeIdentifier: types_map = [] - if model.types_map: + if model.types_mapping: types_map.extend( [ self._create_component_from_model(types_pair, config=config) - for types_pair in model.types_map + for types_pair in model.types_mapping ] ) model_schema_pointer: List[Union[InterpolatedString, str]] = [ @@ -1590,14 +1590,11 @@ def create_schema_type_identifier( [x for x in model.type_pointer] if model.type_pointer else None ) - assert model.is_nullable is not None # for mypy - return SchemaTypeIdentifier( schema_pointer=model_schema_pointer, key_pointer=model_key_pointer, type_pointer=model_type_pointer, types_map=types_map, - is_nullable=model.is_nullable, parameters=model.parameters or {}, ) diff --git a/airbyte_cdk/sources/declarative/schema/__init__.py b/airbyte_cdk/sources/declarative/schema/__init__.py index f6b7cd918..5d2aed60e 100644 --- a/airbyte_cdk/sources/declarative/schema/__init__.py +++ b/airbyte_cdk/sources/declarative/schema/__init__.py @@ -6,6 +6,6 @@ from airbyte_cdk.sources.declarative.schema.inline_schema_loader import InlineSchemaLoader from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader -from airbyte_cdk.sources.declarative.schema.dynamic_schema_loader import DynamicSchemaLoader, TypesPair, SchemaTypeIdentifier +from airbyte_cdk.sources.declarative.schema.dynamic_schema_loader import DynamicSchemaLoader, TypesMap, SchemaTypeIdentifier -__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader", "DynamicSchemaLoader", "TypesPair", "SchemaTypeIdentifier"] +__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader", "DynamicSchemaLoader", "TypesMap", "SchemaTypeIdentifier"] diff --git 
a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py index a4b6239e1..b91a2558e 100644 --- a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py @@ -16,36 +16,36 @@ from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.types import Config -AIRBYTE_DATA_TYPES = { - "string": {"type": "string"}, - "boolean": {"type": "boolean"}, - "date": {"type": "string", "format": "date"}, +AIRBYTE_DATA_TYPES: Mapping[str, Mapping[str, Any]] = { + "string": {"type": ["null", "string"]}, + "boolean": {"type": ["null", "boolean"]}, + "date": {"type": ["null", "string"], "format": "date"}, "timestamp_without_timezone": { - "type": "string", + "type": ["null", "string"], "format": "date-time", "airbyte_type": "timestamp_without_timezone", }, - "timestamp_with_timezone": {"type": "string", "format": "date-time"}, + "timestamp_with_timezone": {"type": ["null", "string"], "format": "date-time"}, "time_without_timezone": { - "type": "string", + "type": ["null", "string"], "format": "time", "airbyte_type": "time_without_timezone", }, "time_with_timezone": { - "type": "string", + "type": ["null", "string"], "format": "time", "airbyte_type": "time_with_timezone", }, - "integer": {"type": "integer"}, - "number": {"type": "number"}, - "array": {"type": "array"}, - "object": {"type": "object"}, + "integer": {"type": ["null", "integer"]}, + "number": {"type": ["null", "number"]}, + "array": {"type": ["null", "array"]}, + "object": {"type": ["null", "object"]}, } @deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) @dataclass(frozen=True) -class TypesPair: +class TypesMap: """ Represents a mapping between a current type and its corresponding target type. 
""" @@ -65,8 +65,7 @@ class SchemaTypeIdentifier: key_pointer: List[Union[InterpolatedString, str]] parameters: InitVar[Mapping[str, Any]] type_pointer: Optional[List[Union[InterpolatedString, str]]] = None - types_map: Optional[List[TypesPair]] = None - is_nullable: bool = True + types_map: Optional[List[TypesMap]] = None def __post_init__(self, parameters: Mapping[str, Any]) -> None: self.schema_pointer = self._update_pointer(self.schema_pointer, parameters) # type: ignore[assignment] # This is reqired field in model @@ -108,19 +107,24 @@ def get_json_schema(self) -> Mapping[str, Any]: Constructs a JSON Schema based on retrieved data. """ properties = {} - for retrieved_record in self.retriever.read_records({}): - raw_schema = self._extract_data( + retrieved_record = next(self.retriever.read_records({}), None) # type: ignore[call-overload] # read_records return Iterable data type + + raw_schema = ( + self._extract_data( retrieved_record, # type: ignore[arg-type] # Expected that retrieved_record will be only Mapping[str, Any] self.schema_type_identifier.schema_pointer, ) - for property_definition in raw_schema: - key = self._get_key(property_definition, self.schema_type_identifier.key_pointer) - value = self._get_type( - property_definition, - self.schema_type_identifier.type_pointer, - is_nullable=self.schema_type_identifier.is_nullable, - ) - properties[key] = value + if retrieved_record + else [] + ) + + for property_definition in raw_schema: + key = self._get_key(property_definition, self.schema_type_identifier.key_pointer) + value = self._get_type( + property_definition, + self.schema_type_identifier.type_pointer, + ) + properties[key] = value return { "$schema": "http://json-schema.org/draft-07/schema#", @@ -145,7 +149,6 @@ def _get_type( self, raw_schema: MutableMapping[str, Any], field_type_path: Optional[List[Union[InterpolatedString, str]]], - is_nullable: bool = True, ) -> Union[Mapping[str, Any], List[Mapping[str, Any]]]: """ Determines the JSON Schema 
type for a field, supporting nullable and combined types. @@ -161,15 +164,11 @@ def _get_type( and len(mapped_field_type) == 2 and all(isinstance(item, str) for item in mapped_field_type) ): - first_type = self._make_field_nullable( - self._get_airbyte_type(mapped_field_type[0]), is_nullable - ) - second_type = self._make_field_nullable( - self._get_airbyte_type(mapped_field_type[1]), is_nullable - ) + first_type = self._get_airbyte_type(mapped_field_type[0]) + second_type = self._get_airbyte_type(mapped_field_type[1]) return {"oneOf": [first_type, second_type]} elif isinstance(mapped_field_type, str): - return self._make_field_nullable(self._get_airbyte_type(mapped_field_type), is_nullable) + return self._get_airbyte_type(mapped_field_type) else: raise ValueError( f"Invalid data type. Available string or two items list of string. Got {mapped_field_type}." @@ -187,18 +186,6 @@ def _replace_type_if_not_valid( return types_pair.target_type return field_type - @staticmethod - def _make_field_nullable( - field_type: Mapping[str, Any], is_nullable: bool = True - ) -> Mapping[str, Any]: - """ - Wraps a field type to allow null values if `is_nullable` is True. 
- """ - updated_field_type = dict(deepcopy(field_type)) - if is_nullable: - updated_field_type["type"] = ["null", updated_field_type["type"]] - return updated_field_type - @staticmethod def _get_airbyte_type(field_type: str) -> Mapping[str, Any]: """ @@ -227,9 +214,4 @@ def _extract_data( for path in extraction_path ] - if "*" in path: - extracted = dpath.values(body, path) # type: ignore # extracted will be a MutableMapping, given input data structure - else: - extracted = dpath.get(body, path, default=default) # type: ignore # extracted will be a MutableMapping, given input data structure - - return extracted + return dpath.get(body, path, default=default) # type: ignore # extracted will be a MutableMapping, given input data structure diff --git a/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py index 04fcd0182..da0770141 100644 --- a/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py +++ b/unit_tests/sources/declarative/schema/test_dynamic_schema_loader.py @@ -30,7 +30,6 @@ def mock_schema_type_identifier(): schema_pointer=["schema"], key_pointer=["key"], type_pointer=["type"], - is_nullable=True, types_map=[], parameters={}, ) @@ -53,14 +52,16 @@ def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): [ ( # Test case: All fields with valid types - [ - { - "schema": [ - {"key": "name", "type": "string"}, - {"key": "age", "type": "integer"}, - ] - } - ], + iter( + [ + { + "schema": [ + {"key": "name", "type": "string"}, + {"key": "age", "type": "integer"}, + ] + } + ] + ), { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", @@ -72,14 +73,16 @@ def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): ), ( # Test case: Fields with missing type default to "string" - [ - { - "schema": [ - {"key": "name"}, - {"key": "email", "type": "string"}, - ] - } - ], + iter( + [ + { + "schema": [ + {"key": "name"}, + 
{"key": "email", "type": "string"}, + ] + } + ] + ), { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", @@ -91,13 +94,15 @@ def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): ), ( # Test case: Fields with nested types - [ - { - "schema": [ - {"key": "address", "type": ["string", "integer"]}, - ] - } - ], + iter( + [ + { + "schema": [ + {"key": "address", "type": ["string", "integer"]}, + ] + } + ] + ), { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", @@ -110,7 +115,7 @@ def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): ), ( # Test case: Empty record set - [], + iter([]), { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", @@ -120,7 +125,7 @@ def dynamic_schema_loader(mock_retriever, mock_schema_type_identifier): ], ) def test_dynamic_schema_loader(dynamic_schema_loader, retriever_data, expected_schema): - dynamic_schema_loader.retriever.read_records.return_value = retriever_data + dynamic_schema_loader.retriever.read_records = MagicMock(return_value=retriever_data) schema = dynamic_schema_loader.get_json_schema() @@ -130,9 +135,9 @@ def test_dynamic_schema_loader(dynamic_schema_loader, retriever_data, expected_s def test_dynamic_schema_loader_invalid_key(dynamic_schema_loader): # Test case: Invalid key type - dynamic_schema_loader.retriever.read_records.return_value = [ - {"schema": [{"field1": {"key": 123, "type": "string"}}]} - ] + dynamic_schema_loader.retriever.read_records.return_value = iter( + [{"schema": [{"field1": {"key": 123, "type": "string"}}]}] + ) with pytest.raises(ValueError, match="Expected key to be a string"): dynamic_schema_loader.get_json_schema() @@ -140,9 +145,9 @@ def test_dynamic_schema_loader_invalid_key(dynamic_schema_loader): def test_dynamic_schema_loader_invalid_type(dynamic_schema_loader): # Test case: Invalid type - dynamic_schema_loader.retriever.read_records.return_value = [ - {"schema": [{"field1": {"key": "name", 
"type": "invalid_type"}}]} - ] + dynamic_schema_loader.retriever.read_records.return_value = iter( + [{"schema": [{"field1": {"key": "name", "type": "invalid_type"}}]}] + ) with pytest.raises(ValueError, match="Expected key to be a string. Got None"): dynamic_schema_loader.get_json_schema() From 05e4f74e88388a1316703045b425ee1c0e8637d3 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Tue, 3 Dec 2024 18:20:05 +0100 Subject: [PATCH 25/25] Add default value for schema pointer --- .../declarative/declarative_component_schema.yaml | 1 - .../declarative/models/declarative_component_schema.py | 4 ++-- .../declarative/parsers/model_to_component_factory.py | 6 +++--- .../declarative/schema/dynamic_schema_loader.py | 10 ++++++---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index b2e1a66c1..4cea78465 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1701,7 +1701,6 @@ definitions: description: (This component is experimental. Use at your own risk.) Identifies schema details for dynamic schema extraction and processing. 
type: object required: - - schema_pointer - key_pointer properties: type: diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 4e3f26ca9..7a9bd9cda 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -657,8 +657,8 @@ class TypesMap(BaseModel): class SchemaTypeIdentifier(BaseModel): type: Optional[Literal["SchemaTypeIdentifier"]] = None - schema_pointer: List[str] = Field( - ..., + schema_pointer: Optional[List[str]] = Field( + [], description="List of nested fields defining the schema field path to extract. Defaults to [].", title="Schema Path", ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index f92c4d416..f8f62eeb5 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1593,9 +1593,9 @@ def create_schema_type_identifier( for types_pair in model.types_mapping ] ) - model_schema_pointer: List[Union[InterpolatedString, str]] = [ - x for x in model.schema_pointer - ] + model_schema_pointer: List[Union[InterpolatedString, str]] = ( + [x for x in model.schema_pointer] if model.schema_pointer else [] + ) model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( [x for x in model.type_pointer] if model.type_pointer else None diff --git a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py index b91a2558e..4b6e9e864 100644 --- a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py @@ 
-61,14 +61,16 @@ class SchemaTypeIdentifier: Identifies schema details for dynamic schema extraction and processing. """ - schema_pointer: List[Union[InterpolatedString, str]] key_pointer: List[Union[InterpolatedString, str]] parameters: InitVar[Mapping[str, Any]] type_pointer: Optional[List[Union[InterpolatedString, str]]] = None types_map: Optional[List[TypesMap]] = None + schema_pointer: Optional[List[Union[InterpolatedString, str]]] = None def __post_init__(self, parameters: Mapping[str, Any]) -> None: - self.schema_pointer = self._update_pointer(self.schema_pointer, parameters) # type: ignore[assignment] # This is reqired field in model + self.schema_pointer = ( + self._update_pointer(self.schema_pointer, parameters) if self.schema_pointer else [] + ) # type: ignore[assignment] # This is reqired field in model self.key_pointer = self._update_pointer(self.key_pointer, parameters) # type: ignore[assignment] # This is reqired field in model self.type_pointer = ( self._update_pointer(self.type_pointer, parameters) if self.type_pointer else None @@ -199,14 +201,14 @@ def _get_airbyte_type(field_type: str) -> Mapping[str, Any]: def _extract_data( self, body: Mapping[str, Any], - extraction_path: List[Union[InterpolatedString, str]], + extraction_path: Optional[List[Union[InterpolatedString, str]]] = None, default: Any = None, ) -> Any: """ Extracts data from the body based on the provided extraction path. """ - if len(extraction_path) == 0: + if not extraction_path: return body path = [